From 83f4b9e954ab21fbedd2e8d7a15f5e47c91c6b6e Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 30 Mar 2016 17:11:10 +0200 Subject: [PATCH 01/89] Split IDNA into a separate crate. --- .gitignore | 4 +- Cargo.toml | 7 +- idna/Cargo.toml | 18 +++++ .../src/IdnaMappingTable.txt | 0 idna/src/lib.rs | 73 ++++++++++++++++++ .../src/make_uts46_mapping_table.py | 3 +- {src => idna/src}/punycode.rs | 0 src/idna.rs => idna/src/uts46.rs | 75 +++++++++---------- .../src/uts46_mapping_table.rs | 0 {tests => idna/tests}/IdnaTest.txt | 0 {tests => idna/tests}/punycode.rs | 11 ++- {tests => idna/tests}/punycode_tests.json | 0 idna/tests/tests.rs | 5 ++ tests/idna.rs => idna/tests/uts46.rs | 12 ++- src/lib.rs | 5 +- 15 files changed, 155 insertions(+), 58 deletions(-) create mode 100644 idna/Cargo.toml rename IdnaMappingTable.txt => idna/src/IdnaMappingTable.txt (100%) create mode 100644 idna/src/lib.rs rename make_idna_table.py => idna/src/make_uts46_mapping_table.py (95%) rename {src => idna/src}/punycode.rs (100%) rename src/idna.rs => idna/src/uts46.rs (84%) rename src/idna_mapping.rs => idna/src/uts46_mapping_table.rs (100%) rename {tests => idna/tests}/IdnaTest.txt (100%) rename {tests => idna/tests}/punycode.rs (79%) rename {tests => idna/tests}/punycode_tests.json (100%) create mode 100644 idna/tests/tests.rs rename tests/idna.rs => idna/tests/uts46.rs (89%) diff --git a/.gitignore b/.gitignore index 7cbe84a5..0284c25c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ -/target -/Cargo.lock +target +Cargo.lock /.cargo/config diff --git a/Cargo.toml b/Cargo.toml index b2458c0a..9d965381 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,10 +16,6 @@ name = "format" [[test]] name = "form_urlencoded" [[test]] -name = "idna" -[[test]] -name = "punycode" -[[test]] name = "tests" [[test]] name = "wpt" @@ -50,8 +46,7 @@ version = ">=0.6.1, <0.8" optional = true [dependencies] +idna = { version = "0.1.0", path = "./idna" } uuid = { version = "0.2", features = ["v4"] } 
rustc-serialize = "0.3" -unicode-bidi = "0.2.3" -unicode-normalization = "0.1.2" matches = "0.1" diff --git a/idna/Cargo.toml b/idna/Cargo.toml new file mode 100644 index 00000000..04546507 --- /dev/null +++ b/idna/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "idna" +version = "0.1.0" +authors = ["Simon Sapin "] +description = "IDNA (Internationalizing Domain Names in Applications) and Punycode." +repository = "https://github.com/servo/rust-url/" +license = "MIT/Apache-2.0" + +[dependencies] +unicode-bidi = "0.2.3" +unicode-normalization = "0.1.2" +matches = "0.1" + +[dev-dependencies] +rustc-serialize = "0.3" + +[[test]] +name = "tests" diff --git a/IdnaMappingTable.txt b/idna/src/IdnaMappingTable.txt similarity index 100% rename from IdnaMappingTable.txt rename to idna/src/IdnaMappingTable.txt diff --git a/idna/src/lib.rs b/idna/src/lib.rs new file mode 100644 index 00000000..d53874f3 --- /dev/null +++ b/idna/src/lib.rs @@ -0,0 +1,73 @@ +// Copyright 2016 Simon Sapin. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! This Rust crate implements IDNA +//! [per the WHATWG URL Standard](https://url.spec.whatwg.org/#idna). +//! +//! It also exposes the underlying algorithms from [*Unicode IDNA Compatibility Processing* +//! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/) +//! and [Punycode (RFC 3492)](https://tools.ietf.org/html/rfc3492). +//! +//! Quoting from [UTS #46’s introduction](http://www.unicode.org/reports/tr46/#Introduction): +//! +//! > Initially, domain names were restricted to ASCII characters. +//! > A system was introduced in 2003 for internationalized domain names (IDN). +//! > This system is called Internationalizing Domain Names for Applications, +//! > or IDNA2003 for short. +//! > This mechanism supports IDNs by means of a client software transformation +//! 
> into a format known as Punycode. +//! > A revision of IDNA was approved in 2010 (IDNA2008). +//! > This revision has a number of incompatibilities with IDNA2003. +//! > +//! > The incompatibilities force implementers of client software, +//! > such as browsers and emailers, +//! > to face difficult choices during the transition period +//! > as registries shift from IDNA2003 to IDNA2008. +//! > This document specifies a mechanism +//! > that minimizes the impact of this transition for client software, +//! > allowing client software to access domains that are valid under either system. + +#[macro_use] extern crate matches; +extern crate unicode_bidi; +extern crate unicode_normalization; + +pub mod punycode; +pub mod uts46; + +/// The [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii) algorithm. +/// +/// Return the ASCII representation a domain name, +/// normalizing characters (upper-case to lower-case and other kinds of equivalence) +/// and using Punycode as necessary. +/// +/// This process may fail. +pub fn domain_to_ascii(domain: &str) -> Result { + uts46::to_ascii(domain, uts46::Flags { + use_std3_ascii_rules: false, + transitional_processing: true, // XXX: switch when Firefox does + verify_dns_length: false, + }) +} + +/// The [domain to Unicode](https://url.spec.whatwg.org/#concept-domain-to-unicode) algorithm. +/// +/// Return the Unicode representation of a domain name, +/// normalizing characters (upper-case to lower-case and other kinds of equivalence) +/// and decoding Punycode as necessary. +/// +/// This may indicate [syntax violations](https://url.spec.whatwg.org/#syntax-violation) +/// but always returns a string for the mapped domain. 
+pub fn domain_to_unicode(domain: &str) -> (String, Result<(), uts46::Errors>) { + uts46::to_unicode(domain, uts46::Flags { + use_std3_ascii_rules: false, + + // Unused: + transitional_processing: true, + verify_dns_length: false, + }) +} diff --git a/make_idna_table.py b/idna/src/make_uts46_mapping_table.py similarity index 95% rename from make_idna_table.py rename to idna/src/make_uts46_mapping_table.py index 5700d680..8e090dc7 100644 --- a/make_idna_table.py +++ b/idna/src/make_uts46_mapping_table.py @@ -6,8 +6,7 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. - -# Run as: python make_idna_table.py idna_table.txt > src/idna_table.rs +# Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs # You can get the latest idna table from # http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt diff --git a/src/punycode.rs b/idna/src/punycode.rs similarity index 100% rename from src/punycode.rs rename to idna/src/punycode.rs diff --git a/src/idna.rs b/idna/src/uts46.rs similarity index 84% rename from src/idna.rs rename to idna/src/uts46.rs index e0efdb39..5f230e0e 100644 --- a/src/idna.rs +++ b/idna/src/uts46.rs @@ -1,6 +1,13 @@ -//! International domain names -//! -//! https://url.spec.whatwg.org/#idna +// Copyright 2013-2014 Valentin Gosu. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! [*Unicode IDNA Compatibility Processing* +//! 
(Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/) use self::Mapping::*; use punycode; @@ -9,7 +16,7 @@ use unicode_normalization::UnicodeNormalization; use unicode_normalization::char::is_combining_mark; use unicode_bidi::{BidiClass, bidi_class}; -include!("idna_mapping.rs"); +include!("uts46_mapping_table.rs"); #[derive(Debug)] enum Mapping { @@ -23,9 +30,9 @@ enum Mapping { } struct Range { - pub from: char, - pub to: char, - pub mapping: Mapping, + from: char, + to: char, + mapping: Mapping, } fn find_char(codepoint: char) -> &'static Mapping { @@ -45,7 +52,7 @@ fn find_char(codepoint: char) -> &'static Mapping { &TABLE[min].mapping } -fn map_char(codepoint: char, flags: Uts46Flags, output: &mut String, errors: &mut Vec) { +fn map_char(codepoint: char, flags: Flags, output: &mut String, errors: &mut Vec) { match *find_char(codepoint) { Mapping::Valid => output.push(codepoint), Mapping::Ignored => {}, @@ -185,7 +192,7 @@ fn passes_bidi(label: &str, transitional_processing: bool) -> bool { } /// http://www.unicode.org/reports/tr46/#Validity_Criteria -fn validate(label: &str, flags: Uts46Flags, errors: &mut Vec) { +fn validate(label: &str, flags: Flags, errors: &mut Vec) { if label.nfc().ne(label.chars()) { errors.push(Error::ValidityCriteria); } @@ -212,7 +219,7 @@ fn validate(label: &str, flags: Uts46Flags, errors: &mut Vec) { } /// http://www.unicode.org/reports/tr46/#Processing -fn uts46_processing(domain: &str, flags: Uts46Flags, errors: &mut Vec) -> String { +fn processing(domain: &str, flags: Flags, errors: &mut Vec) -> String { let mut mapped = String::new(); for c in domain.chars() { map_char(c, flags, &mut mapped, errors) @@ -226,7 +233,7 @@ fn uts46_processing(domain: &str, flags: Uts46Flags, errors: &mut Vec) -> if label.starts_with("xn--") { match punycode::decode_to_string(&label["xn--".len()..]) { Some(decoded_label) => { - let flags = Uts46Flags { transitional_processing: false, ..flags }; + let flags = Flags { 
transitional_processing: false, ..flags }; validate(&decoded_label, flags, errors); validated.push_str(&decoded_label) } @@ -241,14 +248,14 @@ fn uts46_processing(domain: &str, flags: Uts46Flags, errors: &mut Vec) -> } #[derive(Copy, Clone)] -pub struct Uts46Flags { +pub struct Flags { pub use_std3_ascii_rules: bool, pub transitional_processing: bool, pub verify_dns_length: bool, } #[derive(PartialEq, Eq, Clone, Copy, Debug)] -pub enum Error { +enum Error { PunycodeError, ValidityCriteria, DissallowedByStd3AsciiRules, @@ -257,11 +264,18 @@ pub enum Error { TooLongForDns, } +/// Errors recorded during UTS #46 processing. +/// +/// This is opaque for now, only indicating the precense of at least one error. +/// More details may be exposed in the future. +#[derive(Debug)] +pub struct Errors(Vec); + /// http://www.unicode.org/reports/tr46/#ToASCII -pub fn uts46_to_ascii(domain: &str, flags: Uts46Flags) -> Result> { +pub fn to_ascii(domain: &str, flags: Flags) -> Result { let mut errors = Vec::new(); let mut result = String::new(); - for label in uts46_processing(domain, flags, &mut errors).split('.') { + for label in processing(domain, flags, &mut errors).split('.') { if result.len() > 0 { result.push('.'); } @@ -288,36 +302,21 @@ pub fn uts46_to_ascii(domain: &str, flags: Uts46Flags) -> Result Result> { - uts46_to_ascii(domain, Uts46Flags { - use_std3_ascii_rules: false, - transitional_processing: true, // XXX: switch when Firefox does - verify_dns_length: false, - }) -} - /// http://www.unicode.org/reports/tr46/#ToUnicode /// /// Only `use_std3_ascii_rules` is used in `flags`. 
-pub fn uts46_to_unicode(domain: &str, mut flags: Uts46Flags) -> (String, Vec) { +pub fn to_unicode(domain: &str, mut flags: Flags) -> (String, Result<(), Errors>) { flags.transitional_processing = false; let mut errors = Vec::new(); - let domain = uts46_processing(domain, flags, &mut errors); + let domain = processing(domain, flags, &mut errors); + let errors = if errors.is_empty() { + Ok(()) + } else { + Err(Errors(errors)) + }; (domain, errors) } - -/// https://url.spec.whatwg.org/#concept-domain-to-unicode -pub fn domain_to_unicode(domain: &str) -> (String, Vec) { - uts46_to_unicode(domain, Uts46Flags { - use_std3_ascii_rules: false, - - // Unused: - transitional_processing: true, - verify_dns_length: false, - }) -} diff --git a/src/idna_mapping.rs b/idna/src/uts46_mapping_table.rs similarity index 100% rename from src/idna_mapping.rs rename to idna/src/uts46_mapping_table.rs diff --git a/tests/IdnaTest.txt b/idna/tests/IdnaTest.txt similarity index 100% rename from tests/IdnaTest.txt rename to idna/tests/IdnaTest.txt diff --git a/tests/punycode.rs b/idna/tests/punycode.rs similarity index 79% rename from tests/punycode.rs rename to idna/tests/punycode.rs index ae42b34d..0f660fed 100644 --- a/tests/punycode.rs +++ b/idna/tests/punycode.rs @@ -1,7 +1,12 @@ -extern crate url; -extern crate rustc_serialize; +// Copyright 2013 Simon Sapin. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
-use url::punycode::{decode, encode_str}; +use idna::punycode::{decode, encode_str}; use rustc_serialize::json::{Json, Object}; fn one_test(description: &str, decoded: &str, encoded: &str) { diff --git a/tests/punycode_tests.json b/idna/tests/punycode_tests.json similarity index 100% rename from tests/punycode_tests.json rename to idna/tests/punycode_tests.json diff --git a/idna/tests/tests.rs b/idna/tests/tests.rs new file mode 100644 index 00000000..087fcd39 --- /dev/null +++ b/idna/tests/tests.rs @@ -0,0 +1,5 @@ +extern crate idna; +extern crate rustc_serialize; + +mod punycode; +mod uts46; diff --git a/tests/idna.rs b/idna/tests/uts46.rs similarity index 89% rename from tests/idna.rs rename to idna/tests/uts46.rs index bb03f39d..4328e330 100644 --- a/tests/idna.rs +++ b/idna/tests/uts46.rs @@ -1,7 +1,13 @@ -extern crate url; +// Copyright 2013-2014 Valentin Gosu. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. use std::char; -use url::idna; +use idna::uts46; #[test] fn test_uts46() { @@ -35,7 +41,7 @@ fn test_uts46() { continue; } - let result = idna::uts46_to_ascii(&source, idna::Uts46Flags { + let result = uts46::to_ascii(&source, uts46::Flags { use_std3_ascii_rules: true, transitional_processing: test_type == "T", verify_dns_length: true, diff --git a/src/lib.rs b/src/lib.rs index 9caffad0..dcc3e9ed 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -141,8 +141,7 @@ extern crate serde; #[cfg(feature="heap_size")] #[macro_use] extern crate heapsize; -extern crate unicode_normalization; -extern crate unicode_bidi; +extern crate idna; use std::fmt::{self, Formatter}; use std::str; @@ -170,9 +169,7 @@ mod parser; pub mod urlutils; pub mod percent_encoding; pub mod form_urlencoded; -pub mod punycode; pub mod format; -pub mod idna; /// The parsed representation of an absolute URL. 
#[derive(PartialEq, Eq, Clone, Debug, Hash, PartialOrd, Ord)] From e0e60e9b1e8721a0a75a5aaf82f8ec517a68f274 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 30 Mar 2016 18:54:41 +0200 Subject: [PATCH 02/89] IDNA: One test in the test harness per test case. This makes individual failures show up separately. --- idna/Cargo.toml | 18 +++++++---- idna/tests/punycode.rs | 36 +++++++++++++-------- idna/tests/tests.rs | 20 ++++++++++++ idna/tests/uts46.rs | 73 +++++++++++++++++++++++------------------- 4 files changed, 94 insertions(+), 53 deletions(-) diff --git a/idna/Cargo.toml b/idna/Cargo.toml index 04546507..cc7a8d22 100644 --- a/idna/Cargo.toml +++ b/idna/Cargo.toml @@ -6,13 +6,19 @@ description = "IDNA (Internationalizing Domain Names in Applications) and Punyco repository = "https://github.com/servo/rust-url/" license = "MIT/Apache-2.0" -[dependencies] -unicode-bidi = "0.2.3" -unicode-normalization = "0.1.2" -matches = "0.1" +[lib] +doctest = false +test = false + +[[test]] +name = "tests" +harness = false [dev-dependencies] +rustc-test = "0.1" rustc-serialize = "0.3" -[[test]] -name = "tests" +[dependencies] +unicode-bidi = "0.2.3" +unicode-normalization = "0.1.2" +matches = "0.1" diff --git a/idna/tests/punycode.rs b/idna/tests/punycode.rs index 0f660fed..b72c0aba 100644 --- a/idna/tests/punycode.rs +++ b/idna/tests/punycode.rs @@ -8,15 +8,16 @@ use idna::punycode::{decode, encode_str}; use rustc_serialize::json::{Json, Object}; +use test::TestFn; -fn one_test(description: &str, decoded: &str, encoded: &str) { +fn one_test(decoded: &str, encoded: &str) { match decode(encoded) { None => panic!("Decoding {} failed.", encoded), Some(result) => { let result = result.into_iter().collect::(); assert!(result == decoded, - format!("Incorrect decoding of {}:\n {}\n!= {}\n{}", - encoded, result, decoded, description)) + format!("Incorrect decoding of \"{}\":\n \"{}\"\n!= \"{}\"\n", + encoded, result, decoded)) } } @@ -24,8 +25,8 @@ fn one_test(description: &str, 
decoded: &str, encoded: &str) { None => panic!("Encoding {} failed.", decoded), Some(result) => { assert!(result == encoded, - format!("Incorrect encoding of {}:\n {}\n!= {}\n{}", - decoded, result, encoded, description)) + format!("Incorrect encoding of \"{}\":\n \"{}\"\n!= \"{}\"\n", + decoded, result, encoded)) } } } @@ -38,17 +39,24 @@ fn get_string<'a>(map: &'a Object, key: &str) -> &'a str { } } -#[test] -fn test_punycode() { - +pub fn collect_tests(add_test: &mut F) { match Json::from_str(include_str!("punycode_tests.json")) { - Ok(Json::Array(tests)) => for test in &tests { + Ok(Json::Array(tests)) => for (i, test) in tests.into_iter().enumerate() { match test { - &Json::Object(ref o) => one_test( - get_string(o, "description"), - get_string(o, "decoded"), - get_string(o, "encoded") - ), + Json::Object(o) => { + let test_name = { + let desc = get_string(&o, "description"); + if desc.is_empty() { + format!("Punycode {}", i + 1) + } else { + format!("Punycode {}: {}", i + 1, desc) + } + }; + add_test(test_name, TestFn::dyn_test_fn(move || one_test( + get_string(&o, "decoded"), + get_string(&o, "encoded"), + ))) + } _ => panic!(), } }, diff --git a/idna/tests/tests.rs b/idna/tests/tests.rs index 087fcd39..0a4ad03e 100644 --- a/idna/tests/tests.rs +++ b/idna/tests/tests.rs @@ -1,5 +1,25 @@ extern crate idna; extern crate rustc_serialize; +extern crate test; mod punycode; mod uts46; + +fn main() { + let mut tests = Vec::new(); + { + let mut add_test = |name, run| { + tests.push(test::TestDescAndFn { + desc: test::TestDesc { + name: test::DynTestName(name), + ignore: false, + should_panic: test::ShouldPanic::No, + }, + testfn: run, + }) + }; + punycode::collect_tests(&mut add_test); + uts46::collect_tests(&mut add_test); + } + test::test_main(&std::env::args().collect::>(), tests) +} diff --git a/idna/tests/uts46.rs b/idna/tests/uts46.rs index 4328e330..038fdf45 100644 --- a/idna/tests/uts46.rs +++ b/idna/tests/uts46.rs @@ -8,11 +8,11 @@ use std::char; use 
idna::uts46; +use test::TestFn; -#[test] -fn test_uts46() { +pub fn collect_tests(add_test: &mut F) { // http://www.unicode.org/Public/idna/latest/IdnaTest.txt - for line in include_str!("IdnaTest.txt").lines() { + for (i, line) in include_str!("IdnaTest.txt").lines().enumerate() { if line == "" || line.starts_with("#") { continue } @@ -35,47 +35,54 @@ fn test_uts46() { let source = unescape(original); let to_unicode = pieces.remove(0); let to_ascii = pieces.remove(0); - let _nv8 = if pieces.len() > 0 { pieces.remove(0) } else { "" }; + let nv8 = if pieces.len() > 0 { pieces.remove(0) } else { "" }; if expected_failure { continue; } - let result = uts46::to_ascii(&source, uts46::Flags { - use_std3_ascii_rules: true, - transitional_processing: test_type == "T", - verify_dns_length: true, - }); + let test_name = format!("UTS #46 line {}", i + 1); + add_test(test_name, TestFn::dyn_test_fn(move || { + let result = uts46::to_ascii(&source, uts46::Flags { + use_std3_ascii_rules: true, + transitional_processing: test_type == "T", + verify_dns_length: true, + }); - if to_ascii.starts_with("[") { - if to_ascii.starts_with("[C") { - // http://unicode.org/reports/tr46/#Deviations - // applications that perform IDNA2008 lookup are not required to check for these contexts - continue; + if to_ascii.starts_with("[") { + if to_ascii.starts_with("[C") { + // http://unicode.org/reports/tr46/#Deviations + // applications that perform IDNA2008 lookup are not required to check + // for these contexts + return; + } + let res = result.ok(); + assert!(res == None, "Expected error. result: {} | original: {} | source: {}", + res.unwrap(), original, source); + return; } - let res = result.ok(); - assert!(res == None, "Expected error. 
result: {} | original: {} | source: {}", res.unwrap(), original, source); - continue; - } - let to_ascii = if to_ascii.len() > 0 { - to_ascii.to_string() - } else { - if to_unicode.len() > 0 { - to_unicode.to_string() + let to_ascii = if to_ascii.len() > 0 { + to_ascii.to_string() } else { - source.clone() - } - }; + if to_unicode.len() > 0 { + to_unicode.to_string() + } else { + source.clone() + } + }; - if _nv8 == "NV8" { - // This result isn't valid under IDNA2008. Skip it - continue; - } + if nv8 == "NV8" { + // This result isn't valid under IDNA2008. Skip it + return; + } - assert!(result.is_ok(), "Couldn't parse {} | original: {} | error: {:?}", source, original, result.err()); - let output = result.ok().unwrap(); - assert!(output == to_ascii, "result: {} | expected: {} | original: {} | source: {}", output, to_ascii, original, source); + assert!(result.is_ok(), "Couldn't parse {} | original: {} | error: {:?}", + source, original, result.err()); + let output = result.ok().unwrap(); + assert!(output == to_ascii, "result: {} | expected: {} | original: {} | source: {}", + output, to_ascii, original, source); + })) } } From 46acea98abf7924cc5cf0bb2ea84e41914e5371d Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 4 Dec 2015 19:24:23 +0100 Subject: [PATCH 03/89] Make it possible to define new encode sets in other crates. --- make_encode_sets.py | 42 ------ src/encode_sets.rs | 298 ---------------------------------------- src/percent_encoding.rs | 136 +++++++++++++----- 3 files changed, 100 insertions(+), 376 deletions(-) delete mode 100644 make_encode_sets.py delete mode 100644 src/encode_sets.rs diff --git a/make_encode_sets.py b/make_encode_sets.py deleted file mode 100644 index eb859050..00000000 --- a/make_encode_sets.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2013-2014 Simon Sapin. -# -# Licensed under the Apache License, Version 2.0 or the MIT license -# , at your -# option. 
This file may not be copied, modified, or distributed -# except according to those terms. - - -# Run as: python make_encode_sets.py > src/encode_sets.rs - - -print('''\ -// Copyright 2013-2014 Simon Sapin. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -// Generated by make_encode_sets.py -''') -for name, encoded in [ - ('SIMPLE', ''), - ('QUERY', r''' "#<>'''), - ('DEFAULT', r''' "#<>`?{}'''), - ('USERINFO', r''' "#<>`?{}@'''), - ('PASSWORD', r''' "#<>`?{}@\/'''), - ('USERNAME', r''' "#<>`?{}@\/:'''), - ('FORM_URLENCODED', r''' !"#$%&\'()+,/:;<=>?@[\]^`{|}~'''), - ('HTTP_VALUE', r''' "%'()*,/:;<->?[\]{}'''), -]: - print( - "pub static %s: [&'static str; 256] = [\n%s\n];\n\n" - % (name, '\n'.join( - ' ' + ' '.join( - '"%s%s",' % ("\\" if chr(b) in '\\"' else "", chr(b)) - if 0x20 <= b <= 0x7E and chr(b) not in encoded - else '"%%%02X",' % b - for b in range(s, s + 8) - ) for s in range(0, 256, 8)))) diff --git a/src/encode_sets.rs b/src/encode_sets.rs deleted file mode 100644 index d7b5fb9d..00000000 --- a/src/encode_sets.rs +++ /dev/null @@ -1,298 +0,0 @@ -// Copyright 2013-2014 Simon Sapin. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. 
- -// Generated by make_encode_sets.py - -pub static SIMPLE: [&'static str; 256] = [ - "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", - "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", - "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", - "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F", - " ", "!", "\"", "#", "$", "%", "&", "'", - "(", ")", "*", "+", ",", "-", ".", "/", - "0", "1", "2", "3", "4", "5", "6", "7", - "8", "9", ":", ";", "<", "=", ">", "?", - "@", "A", "B", "C", "D", "E", "F", "G", - "H", "I", "J", "K", "L", "M", "N", "O", - "P", "Q", "R", "S", "T", "U", "V", "W", - "X", "Y", "Z", "[", "\\", "]", "^", "_", - "`", "a", "b", "c", "d", "e", "f", "g", - "h", "i", "j", "k", "l", "m", "n", "o", - "p", "q", "r", "s", "t", "u", "v", "w", - "x", "y", "z", "{", "|", "}", "~", "%7F", - "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", - "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", - "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97", - "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F", - "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", - "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", - "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7", - "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF", - "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", - "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", - "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", - "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF", - "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", - "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", - "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", - "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF", -]; - - -pub static QUERY: [&'static str; 256] = [ - "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", - "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", - "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", - "%18", "%19", 
"%1A", "%1B", "%1C", "%1D", "%1E", "%1F", - "%20", "!", "%22", "%23", "$", "%", "&", "'", - "(", ")", "*", "+", ",", "-", ".", "/", - "0", "1", "2", "3", "4", "5", "6", "7", - "8", "9", ":", ";", "%3C", "=", "%3E", "?", - "@", "A", "B", "C", "D", "E", "F", "G", - "H", "I", "J", "K", "L", "M", "N", "O", - "P", "Q", "R", "S", "T", "U", "V", "W", - "X", "Y", "Z", "[", "\\", "]", "^", "_", - "`", "a", "b", "c", "d", "e", "f", "g", - "h", "i", "j", "k", "l", "m", "n", "o", - "p", "q", "r", "s", "t", "u", "v", "w", - "x", "y", "z", "{", "|", "}", "~", "%7F", - "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", - "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", - "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97", - "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F", - "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", - "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", - "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7", - "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF", - "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", - "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", - "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", - "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF", - "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", - "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", - "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", - "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF", -]; - - -pub static DEFAULT: [&'static str; 256] = [ - "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", - "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", - "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", - "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F", - "%20", "!", "%22", "%23", "$", "%", "&", "'", - "(", ")", "*", "+", ",", "-", ".", "/", - "0", "1", "2", "3", "4", "5", "6", "7", - "8", "9", ":", ";", "%3C", "=", "%3E", "%3F", - "@", "A", "B", "C", "D", "E", "F", "G", - 
"H", "I", "J", "K", "L", "M", "N", "O", - "P", "Q", "R", "S", "T", "U", "V", "W", - "X", "Y", "Z", "[", "\\", "]", "^", "_", - "%60", "a", "b", "c", "d", "e", "f", "g", - "h", "i", "j", "k", "l", "m", "n", "o", - "p", "q", "r", "s", "t", "u", "v", "w", - "x", "y", "z", "%7B", "|", "%7D", "~", "%7F", - "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", - "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", - "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97", - "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F", - "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", - "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", - "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7", - "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF", - "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", - "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", - "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", - "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF", - "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", - "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", - "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", - "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF", -]; - - -pub static USERINFO: [&'static str; 256] = [ - "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", - "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", - "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", - "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F", - "%20", "!", "%22", "%23", "$", "%", "&", "'", - "(", ")", "*", "+", ",", "-", ".", "/", - "0", "1", "2", "3", "4", "5", "6", "7", - "8", "9", ":", ";", "%3C", "=", "%3E", "%3F", - "%40", "A", "B", "C", "D", "E", "F", "G", - "H", "I", "J", "K", "L", "M", "N", "O", - "P", "Q", "R", "S", "T", "U", "V", "W", - "X", "Y", "Z", "[", "\\", "]", "^", "_", - "%60", "a", "b", "c", "d", "e", "f", "g", - "h", "i", "j", "k", "l", "m", "n", "o", - "p", "q", "r", "s", "t", "u", "v", "w", - 
"x", "y", "z", "%7B", "|", "%7D", "~", "%7F", - "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", - "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", - "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97", - "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F", - "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", - "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", - "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7", - "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF", - "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", - "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", - "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", - "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF", - "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", - "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", - "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", - "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF", -]; - - -pub static PASSWORD: [&'static str; 256] = [ - "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", - "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", - "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", - "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F", - "%20", "!", "%22", "%23", "$", "%", "&", "'", - "(", ")", "*", "+", ",", "-", ".", "%2F", - "0", "1", "2", "3", "4", "5", "6", "7", - "8", "9", ":", ";", "%3C", "=", "%3E", "%3F", - "%40", "A", "B", "C", "D", "E", "F", "G", - "H", "I", "J", "K", "L", "M", "N", "O", - "P", "Q", "R", "S", "T", "U", "V", "W", - "X", "Y", "Z", "[", "%5C", "]", "^", "_", - "%60", "a", "b", "c", "d", "e", "f", "g", - "h", "i", "j", "k", "l", "m", "n", "o", - "p", "q", "r", "s", "t", "u", "v", "w", - "x", "y", "z", "%7B", "|", "%7D", "~", "%7F", - "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", - "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", - "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97", - "%98", "%99", "%9A", "%9B", 
"%9C", "%9D", "%9E", "%9F", - "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", - "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", - "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7", - "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF", - "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", - "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", - "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", - "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF", - "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", - "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", - "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", - "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF", -]; - - -pub static USERNAME: [&'static str; 256] = [ - "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", - "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", - "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", - "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F", - "%20", "!", "%22", "%23", "$", "%", "&", "'", - "(", ")", "*", "+", ",", "-", ".", "%2F", - "0", "1", "2", "3", "4", "5", "6", "7", - "8", "9", "%3A", ";", "%3C", "=", "%3E", "%3F", - "%40", "A", "B", "C", "D", "E", "F", "G", - "H", "I", "J", "K", "L", "M", "N", "O", - "P", "Q", "R", "S", "T", "U", "V", "W", - "X", "Y", "Z", "[", "%5C", "]", "^", "_", - "%60", "a", "b", "c", "d", "e", "f", "g", - "h", "i", "j", "k", "l", "m", "n", "o", - "p", "q", "r", "s", "t", "u", "v", "w", - "x", "y", "z", "%7B", "|", "%7D", "~", "%7F", - "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", - "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", - "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97", - "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F", - "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", - "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", - "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7", - "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", 
"%BF", - "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", - "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", - "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", - "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF", - "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", - "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", - "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", - "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF", -]; - - -pub static FORM_URLENCODED: [&'static str; 256] = [ - "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", - "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", - "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", - "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F", - "%20", "%21", "%22", "%23", "%24", "%25", "%26", "%27", - "%28", "%29", "*", "%2B", "%2C", "-", ".", "%2F", - "0", "1", "2", "3", "4", "5", "6", "7", - "8", "9", "%3A", "%3B", "%3C", "%3D", "%3E", "%3F", - "%40", "A", "B", "C", "D", "E", "F", "G", - "H", "I", "J", "K", "L", "M", "N", "O", - "P", "Q", "R", "S", "T", "U", "V", "W", - "X", "Y", "Z", "%5B", "%5C", "%5D", "%5E", "_", - "%60", "a", "b", "c", "d", "e", "f", "g", - "h", "i", "j", "k", "l", "m", "n", "o", - "p", "q", "r", "s", "t", "u", "v", "w", - "x", "y", "z", "%7B", "%7C", "%7D", "%7E", "%7F", - "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", - "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", - "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97", - "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F", - "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", - "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", - "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7", - "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF", - "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", - "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", - "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", - "%D8", "%D9", "%DA", "%DB", 
"%DC", "%DD", "%DE", "%DF", - "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", - "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", - "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", - "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF", -]; - - -pub static HTTP_VALUE: [&'static str; 256] = [ - "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", - "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", - "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", - "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F", - "%20", "!", "%22", "#", "$", "%25", "&", "%27", - "%28", "%29", "%2A", "+", "%2C", "%2D", ".", "%2F", - "0", "1", "2", "3", "4", "5", "6", "7", - "8", "9", "%3A", "%3B", "%3C", "=", "%3E", "%3F", - "@", "A", "B", "C", "D", "E", "F", "G", - "H", "I", "J", "K", "L", "M", "N", "O", - "P", "Q", "R", "S", "T", "U", "V", "W", - "X", "Y", "Z", "%5B", "%5C", "%5D", "^", "_", - "`", "a", "b", "c", "d", "e", "f", "g", - "h", "i", "j", "k", "l", "m", "n", "o", - "p", "q", "r", "s", "t", "u", "v", "w", - "x", "y", "z", "%7B", "|", "%7D", "~", "%7F", - "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", - "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", - "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97", - "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F", - "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", - "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", - "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7", - "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF", - "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", - "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", - "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", - "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF", - "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", - "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", - "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", - "%F8", "%F9", "%FA", "%FB", 
"%FC", "%FD", "%FE", "%FF", -]; - - diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs index ee11cc3d..79c17d0d 100644 --- a/src/percent_encoding.rs +++ b/src/percent_encoding.rs @@ -6,9 +6,8 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. - -#[path = "encode_sets.rs"] -mod encode_sets; +use std::ascii::AsciiExt; +use std::fmt::Write; /// Represents a set of characters / bytes that should be percent-encoded. /// @@ -21,51 +20,116 @@ mod encode_sets; /// In the query string however, a question mark does not have any special meaning /// and does not need to be percent-encoded. /// -/// Since the implementation details of `EncodeSet` are private, -/// the set of available encode sets is not extensible beyond the ones -/// provided here. -/// If you need a different encode set, -/// please [file a bug](https://github.com/servo/rust-url/issues) -/// explaining the use case. -#[derive(Copy, Clone)] -pub struct EncodeSet { - map: &'static [&'static str; 256], +/// A few sets are defined in this module. +/// Use the [`define_encode_set!`](../macro.define_encode_set!.html) macro to define different ones. +pub trait EncodeSet { + fn contains(&self, byte: u8) -> bool; } -/// This encode set is used for fragment identifier and non-relative scheme data. -pub static SIMPLE_ENCODE_SET: EncodeSet = EncodeSet { map: &encode_sets::SIMPLE }; +/// Define a new struct +/// that implements the [`EncodeSet`](percent_encoding/trait.EncodeSet.html) trait, +/// for use in [`percent_decode()`](percent_encoding/fn.percent_encode.html) +/// and related functions. +/// +/// Parameters are ASCII printable characters to include in the set +/// in addition to U+0000 to U+001F and above U+007F. +/// See [encode sets specification](http://url.spec.whatwg.org/#simple-encode-set). +/// +/// Example +/// ======= +/// +/// ```rust +/// #[macro_use] extern crate url; +/// define_encode_set! 
{ +/// /// This encode set is used in the URL parser for query strings. +/// pub QUERY_ENCODE_SET = {' ', '"', '#', '<', '>'} +/// } +/// # fn main() { +/// assert_eq!(url::percent_encoding::percent_encode(b"foo bar", QUERY_ENCODE_SET), "foo%20bar"); +/// # } +/// ``` +#[macro_export] +macro_rules! define_encode_set { + ($(#[$attr: meta])* pub $name: ident = {$($ch: pat),*}) => { + $(#[$attr])* + #[derive(Copy, Clone)] + #[allow(non_camel_case_types)] + pub struct $name; + + impl $crate::percent_encoding::EncodeSet for $name { + fn contains(&self, byte: u8) -> bool { + match byte as char { + $( + $ch => true, + )* + _ => byte < 0x20 || byte > 0x7E + } + } + } + } +} -/// This encode set is used in the URL parser for query strings. -pub static QUERY_ENCODE_SET: EncodeSet = EncodeSet { map: &encode_sets::QUERY }; +define_encode_set! { + /// This encode set is used for fragment identifier and non-relative scheme data. + pub SIMPLE_ENCODE_SET = {} +} -/// This encode set is used for path components. -pub static DEFAULT_ENCODE_SET: EncodeSet = EncodeSet { map: &encode_sets::DEFAULT }; +define_encode_set! { + /// This encode set is used in the URL parser for query strings. + pub QUERY_ENCODE_SET = {' ', '"', '#', '<', '>'} +} -/// This encode set is used in the URL parser for usernames and passwords. -pub static USERINFO_ENCODE_SET: EncodeSet = EncodeSet { map: &encode_sets::USERINFO }; +define_encode_set! { + /// This encode set is used for path components. + pub DEFAULT_ENCODE_SET = {' ', '"', '#', '<', '>', '`', '?', '{', '}'} +} -/// This encode set should be used when setting the password field of a parsed URL. -pub static PASSWORD_ENCODE_SET: EncodeSet = EncodeSet { map: &encode_sets::PASSWORD }; +define_encode_set! { + /// This encode set is used in the URL parser for usernames and passwords. + pub USERINFO_ENCODE_SET = {' ', '"', '#', '<', '>', '`', '?', '{', '}', '@'} +} -/// This encode set should be used when setting the username field of a parsed URL. 
-pub static USERNAME_ENCODE_SET: EncodeSet = EncodeSet { map: &encode_sets::USERNAME }; +define_encode_set! { + /// This encode set should be used when setting the password field of a parsed URL. + pub PASSWORD_ENCODE_SET = {' ', '"', '#', '<', '>', '`', '?', '{', '}', '@', '\\', '/'} +} -/// This encode set is used in `application/x-www-form-urlencoded` serialization. -pub static FORM_URLENCODED_ENCODE_SET: EncodeSet = EncodeSet { - map: &encode_sets::FORM_URLENCODED, -}; +define_encode_set! { + /// This encode set should be used when setting the username field of a parsed URL. + pub USERNAME_ENCODE_SET = {' ', '"', '#', '<', '>', '`', '?', '{', '}', '@', '\\', '/', ':'} +} -/// This encode set is used for HTTP header values and is defined at -/// https://tools.ietf.org/html/rfc5987#section-3.2 -pub static HTTP_VALUE_ENCODE_SET: EncodeSet = EncodeSet { map: &encode_sets::HTTP_VALUE }; +define_encode_set! { + /// This encode set is used in `application/x-www-form-urlencoded` serialization. + pub FORM_URLENCODED_ENCODE_SET = { + ' ', '!', '"', '#', '$', '%', '&', '\'', '(', ')', '+', ',', '/', ':', ';', + '<', '=', '>', '?', '@', '[', '\\', ']', '^', '`', '{', '|', '}', '~' + } +} + +define_encode_set! { + /// This encode set is used for HTTP header values and is defined at + /// https://tools.ietf.org/html/rfc5987#section-3.2 + pub HTTP_VALUE = { + ' ', '"', '%', '\'', '(', ')', '*', ',', '/', ':', ';', '<', '-', '>', '?', + '[', '\\', ']', '{', '}' + } +} /// Percent-encode the given bytes, and push the result to `output`. /// /// The pushed strings are within the ASCII range. 
#[inline] -pub fn percent_encode_to(input: &[u8], encode_set: EncodeSet, output: &mut String) { +pub fn percent_encode_to(input: &[u8], encode_set: E, output: &mut String) { for &byte in input { - output.push_str(encode_set.map[byte as usize]) + if encode_set.contains(byte) { + write!(output, "%{:02X}", byte).unwrap(); + } else { + assert!(byte.is_ascii()); + unsafe { + output.as_mut_vec().push(byte) + } + } } } @@ -74,7 +138,7 @@ pub fn percent_encode_to(input: &[u8], encode_set: EncodeSet, output: &mut Strin /// /// The returned string is within the ASCII range. #[inline] -pub fn percent_encode(input: &[u8], encode_set: EncodeSet) -> String { +pub fn percent_encode(input: &[u8], encode_set: E) -> String { let mut output = String::new(); percent_encode_to(input, encode_set, &mut output); output @@ -85,7 +149,7 @@ pub fn percent_encode(input: &[u8], encode_set: EncodeSet) -> String { /// /// The pushed strings are within the ASCII range. #[inline] -pub fn utf8_percent_encode_to(input: &str, encode_set: EncodeSet, output: &mut String) { +pub fn utf8_percent_encode_to(input: &str, encode_set: E, output: &mut String) { percent_encode_to(input.as_bytes(), encode_set, output) } @@ -94,7 +158,7 @@ pub fn utf8_percent_encode_to(input: &str, encode_set: EncodeSet, output: &mut S /// /// The returned string is within the ASCII range. #[inline] -pub fn utf8_percent_encode(input: &str, encode_set: EncodeSet) -> String { +pub fn utf8_percent_encode(input: &str, encode_set: E) -> String { let mut output = String::new(); utf8_percent_encode_to(input, encode_set, &mut output); output From 87ee4340d2b1650777a5e55572b2ca084eefa2f9 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 4 Dec 2015 22:26:51 +0100 Subject: [PATCH 04/89] Define encode sets based on another set. 
--- src/percent_encoding.rs | 42 +++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs index 79c17d0d..23c668b6 100644 --- a/src/percent_encoding.rs +++ b/src/percent_encoding.rs @@ -23,6 +23,8 @@ use std::fmt::Write; /// A few sets are defined in this module. /// Use the [`define_encode_set!`](../macro.define_encode_set!.html) macro to define different ones. pub trait EncodeSet { + /// Called with UTF-8 bytes rather than code points. + /// Should return false for all non-ASCII bytes. fn contains(&self, byte: u8) -> bool; } @@ -31,8 +33,7 @@ pub trait EncodeSet { /// for use in [`percent_decode()`](percent_encoding/fn.percent_encode.html) /// and related functions. /// -/// Parameters are ASCII printable characters to include in the set -/// in addition to U+0000 to U+001F and above U+007F. +/// Parameters are characters to include in the set in addition to those of the base set. /// See [encode sets specification](http://url.spec.whatwg.org/#simple-encode-set). /// /// Example @@ -40,68 +41,77 @@ pub trait EncodeSet { /// /// ```rust /// #[macro_use] extern crate url; +/// use url::percent_encoding::{utf8_percent_encode, SIMPLE_ENCODE_SET}; /// define_encode_set! { /// /// This encode set is used in the URL parser for query strings. -/// pub QUERY_ENCODE_SET = {' ', '"', '#', '<', '>'} +/// pub QUERY_ENCODE_SET = [SIMPLE_ENCODE_SET] | {' ', '"', '#', '<', '>'} /// } /// # fn main() { -/// assert_eq!(url::percent_encoding::percent_encode(b"foo bar", QUERY_ENCODE_SET), "foo%20bar"); +/// assert_eq!(utf8_percent_encode("foo bar", QUERY_ENCODE_SET), "foo%20bar"); /// # } /// ``` #[macro_export] macro_rules! 
define_encode_set { - ($(#[$attr: meta])* pub $name: ident = {$($ch: pat),*}) => { + ($(#[$attr: meta])* pub $name: ident = [$base_set: expr] | {$($ch: pat),*}) => { $(#[$attr])* #[derive(Copy, Clone)] #[allow(non_camel_case_types)] pub struct $name; impl $crate::percent_encoding::EncodeSet for $name { + #[inline] fn contains(&self, byte: u8) -> bool { match byte as char { $( $ch => true, )* - _ => byte < 0x20 || byte > 0x7E + _ => $base_set.contains(byte) } } } } } -define_encode_set! { - /// This encode set is used for fragment identifier and non-relative scheme data. - pub SIMPLE_ENCODE_SET = {} +/// This encode set is used for fragment identifier and non-relative scheme data. +#[derive(Copy, Clone)] +#[allow(non_camel_case_types)] +pub struct SIMPLE_ENCODE_SET; + +impl EncodeSet for SIMPLE_ENCODE_SET { + #[inline] + fn contains(&self, byte: u8) -> bool { + byte < 0x20 || byte > 0x7E + } } define_encode_set! { /// This encode set is used in the URL parser for query strings. - pub QUERY_ENCODE_SET = {' ', '"', '#', '<', '>'} + pub QUERY_ENCODE_SET = [SIMPLE_ENCODE_SET] | {' ', '"', '#', '<', '>'} } define_encode_set! { /// This encode set is used for path components. - pub DEFAULT_ENCODE_SET = {' ', '"', '#', '<', '>', '`', '?', '{', '}'} + pub DEFAULT_ENCODE_SET = [QUERY_ENCODE_SET] | {'`', '?', '{', '}'} } define_encode_set! { /// This encode set is used in the URL parser for usernames and passwords. - pub USERINFO_ENCODE_SET = {' ', '"', '#', '<', '>', '`', '?', '{', '}', '@'} + pub USERINFO_ENCODE_SET = [DEFAULT_ENCODE_SET] | {'@'} } define_encode_set! { /// This encode set should be used when setting the password field of a parsed URL. - pub PASSWORD_ENCODE_SET = {' ', '"', '#', '<', '>', '`', '?', '{', '}', '@', '\\', '/'} + pub PASSWORD_ENCODE_SET = [USERINFO_ENCODE_SET] | {'\\', '/'} } define_encode_set! { /// This encode set should be used when setting the username field of a parsed URL. 
- pub USERNAME_ENCODE_SET = {' ', '"', '#', '<', '>', '`', '?', '{', '}', '@', '\\', '/', ':'} + pub USERNAME_ENCODE_SET = [PASSWORD_ENCODE_SET] | {':'} } define_encode_set! { /// This encode set is used in `application/x-www-form-urlencoded` serialization. - pub FORM_URLENCODED_ENCODE_SET = { + pub FORM_URLENCODED_ENCODE_SET = [SIMPLE_ENCODE_SET] | { ' ', '!', '"', '#', '$', '%', '&', '\'', '(', ')', '+', ',', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '`', '{', '|', '}', '~' } @@ -110,7 +120,7 @@ define_encode_set! { define_encode_set! { /// This encode set is used for HTTP header values and is defined at /// https://tools.ietf.org/html/rfc5987#section-3.2 - pub HTTP_VALUE = { + pub HTTP_VALUE = [SIMPLE_ENCODE_SET] | { ' ', '"', '%', '\'', '(', ')', '*', ',', '/', ':', ';', '<', '-', '>', '?', '[', '\\', ']', '{', '}' } From 4ec32fad790b36cdc89dd0d5cea81ed6c24718e9 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 4 Dec 2015 21:59:42 +0100 Subject: [PATCH 05/89] Remove the HTTP_VALUE encode set. It can be defined in another crate. See the define_encode_set! macro. --- Cargo.toml | 2 +- src/percent_encoding.rs | 9 --------- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9d965381..42346763 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "url" -version = "0.5.9" +version = "1.0.0-dev" authors = [ "Simon Sapin " ] description = "URL library for Rust, based on the WHATWG URL Standard" diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs index 23c668b6..9ebcfe3e 100644 --- a/src/percent_encoding.rs +++ b/src/percent_encoding.rs @@ -117,15 +117,6 @@ define_encode_set! { } } -define_encode_set! 
{ - /// This encode set is used for HTTP header values and is defined at - /// https://tools.ietf.org/html/rfc5987#section-3.2 - pub HTTP_VALUE = [SIMPLE_ENCODE_SET] | { - ' ', '"', '%', '\'', '(', ')', '*', ',', '/', ':', ';', '<', '-', '>', '?', - '[', '\\', ']', '{', '}' - } -} - /// Percent-encode the given bytes, and push the result to `output`. /// /// The pushed strings are within the ASCII range. From 9e759f18726c8e1343162922b87163d4dd08fe3c Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 9 Dec 2015 17:26:24 -0500 Subject: [PATCH 06/89] Rewrite ALL THE THINGS! This changes the data structure for `Url`: Rather than having multiple `String` (or `Vec`) components, this uses a single `String` that contains the serialization of an URL and some indices into it to access components in O(1) time. This saves on memory allocations and makes serialization and some other methods very cheap, as they return `&str` rather than building a new `String`. As a consequence, most of `src/lib.rs` and `src/parser.rs` had to be rewritten. Fixes #142. 
--- Cargo.toml | 2 - src/encoding.rs | 2 + src/format.rs | 81 --- src/host.rs | 102 +-- src/lib.rs | 1087 +++++++++--------------------- src/parser.rs | 1379 ++++++++++++++++++++++++--------------- src/percent_encoding.rs | 15 +- src/urlutils.rs | 169 ----- tests/format.rs | 67 -- tests/tests.rs | 139 ++-- tests/urltestdata.txt | 26 +- tests/wpt.rs | 124 ++-- 12 files changed, 1396 insertions(+), 1797 deletions(-) delete mode 100644 src/format.rs delete mode 100644 src/urlutils.rs delete mode 100644 tests/format.rs diff --git a/Cargo.toml b/Cargo.toml index 42346763..33e23fe8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,8 +11,6 @@ readme = "README.md" keywords = ["url", "parser"] license = "MIT/Apache-2.0" -[[test]] -name = "format" [[test]] name = "form_urlencoded" [[test]] diff --git a/src/encoding.rs b/src/encoding.rs index 5cdd71d3..be53ea19 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -37,6 +37,7 @@ impl EncodingOverride { } } + #[inline] pub fn utf8() -> EncodingOverride { EncodingOverride { encoding: None } } @@ -75,6 +76,7 @@ pub struct EncodingOverride; #[cfg(not(feature = "query_encoding"))] impl EncodingOverride { + #[inline] pub fn utf8() -> EncodingOverride { EncodingOverride } diff --git a/src/format.rs b/src/format.rs deleted file mode 100644 index ad656056..00000000 --- a/src/format.rs +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2013-2015 Simon Sapin. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -//! Formatting utilities for URLs. -//! -//! These formatters can be used to coerce various URL parts into strings. -//! -//! You can use `.to_string()`, as the formatters implement `fmt::Display`. - -use std::fmt::{self, Formatter}; -use super::Url; - -/// Formatter and serializer for URL path data. 
-pub struct PathFormatter<'a, T:'a> { - /// The path as a slice of string-like objects (String or &str). - pub path: &'a [T] -} - -impl<'a, T: fmt::Display> fmt::Display for PathFormatter<'a, T> { - fn fmt(&self, formatter: &mut Formatter) -> fmt::Result { - if self.path.is_empty() { - formatter.write_str("/") - } else { - for path_part in self.path { - try!("/".fmt(formatter)); - try!(path_part.fmt(formatter)); - } - Ok(()) - } - } -} - - -/// Formatter and serializer for URL username and password data. -pub struct UserInfoFormatter<'a> { - /// URL username as a string slice. - pub username: &'a str, - - /// URL password as an optional string slice. - /// - /// You can convert an `Option` with `.as_ref().map(|s| s)`. - pub password: Option<&'a str> -} - -impl<'a> fmt::Display for UserInfoFormatter<'a> { - fn fmt(&self, formatter: &mut Formatter) -> fmt::Result { - if !self.username.is_empty() || self.password.is_some() { - try!(formatter.write_str(self.username)); - if let Some(password) = self.password { - try!(formatter.write_str(":")); - try!(formatter.write_str(password)); - } - try!(formatter.write_str("@")); - } - Ok(()) - } -} - - -/// Formatter for URLs which ignores the fragment field. -pub struct UrlNoFragmentFormatter<'a> { - pub url: &'a Url -} - -impl<'a> fmt::Display for UrlNoFragmentFormatter<'a> { - fn fmt(&self, formatter: &mut Formatter) -> fmt::Result { - try!(formatter.write_str(&self.url.scheme)); - try!(formatter.write_str(":")); - try!(self.url.scheme_data.fmt(formatter)); - if let Some(ref query) = self.url.query { - try!(formatter.write_str("?")); - try!(formatter.write_str(query)); - } - Ok(()) - } -} diff --git a/src/host.rs b/src/host.rs index 06ac7818..a1b1f2af 100644 --- a/src/host.rs +++ b/src/host.rs @@ -6,39 +6,58 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. 
-use std::ascii::AsciiExt; use std::cmp; -use std::fmt::{self, Formatter}; +use std::fmt::{self, Formatter, Write}; use std::net::{Ipv4Addr, Ipv6Addr}; use parser::{ParseResult, ParseError}; use percent_encoding::{from_hex, percent_decode}; use idna; +#[derive(Copy, Clone, Debug)] +#[cfg_attr(feature="heap_size", derive(HeapSizeOf))] +pub enum HostInternal { + None, + Domain, + Ipv4(Ipv4Addr), + Ipv6(Ipv6Addr), +} /// The host name of an URL. -#[derive(PartialEq, Eq, Clone, Debug, Hash, PartialOrd, Ord)] +#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] #[cfg_attr(feature="heap_size", derive(HeapSizeOf))] -pub enum Host { - /// A (DNS) domain name. - Domain(String), - /// A IPv4 address, represented by four sequences of up to three ASCII digits. +pub enum Host { + /// A DNS domain name, as '.' dot-separated labels. + /// Non-ASCII labels are encoded in punycode per IDNA. + Domain(S), + + /// An IPv4 address. + /// `Url::host_str` returns the serialization of this address, + /// as four decimal integers separated by `.` dots. Ipv4(Ipv4Addr), - /// An IPv6 address, represented inside `[...]` square brackets - /// so that `:` colon characters in the address are not ambiguous - /// with the port number delimiter. + + /// An IPv6 address. + /// `Url::host_str` returns the serialization of that address between `[` and `]` brackets, + /// in the format per [RFC 5952 *A Recommendation + /// for IPv6 Address Text Representation*](https://tools.ietf.org/html/rfc5952): + /// lowercase hexadecimal with maximal `::` compression. Ipv6(Ipv6Addr), } +impl<'a> Host<&'a str> { + pub fn to_owned(&self) -> Host { + match *self { + Host::Domain(domain) => Host::Domain(domain.to_owned()), + Host::Ipv4(address) => Host::Ipv4(address), + Host::Ipv6(address) => Host::Ipv6(address), + } + } +} -impl Host { +impl Host { /// Parse a host: either an IPv6 address in [] square brackets, or a domain. 
/// - /// Returns `Err` for an empty host, an invalid IPv6 address, - /// or a or invalid non-ASCII domain. - pub fn parse(input: &str) -> ParseResult { - if input.len() == 0 { - return Err(ParseError::EmptyHost) - } + /// https://url.spec.whatwg.org/#host-parsing + pub fn parse(input: &str) -> Result { if input.starts_with("[") { if !input.ends_with("]") { return Err(ParseError::InvalidIpv6Address) @@ -47,37 +66,24 @@ impl Host { } let decoded = percent_decode(input.as_bytes()); let domain = String::from_utf8_lossy(&decoded); - - let domain = match idna::domain_to_ascii(&domain) { - Ok(s) => s, - Err(_) => return Err(ParseError::InvalidDomainCharacter) - }; - - if domain.find(&[ - '\0', '\t', '\n', '\r', ' ', '#', '%', '/', ':', '?', '@', '[', '\\', ']' - ][..]).is_some() { + let domain = try!(idna::domain_to_ascii(&domain)); + if domain.find(|c| matches!(c, + '\0' | '\t' | '\n' | '\r' | ' ' | '#' | '%' | '/' | ':' | '?' | '@' | '[' | '\\' | ']' + )).is_some() { return Err(ParseError::InvalidDomainCharacter) } - match parse_ipv4addr(&domain[..]) { - Ok(Some(ipv4addr)) => Ok(Host::Ipv4(ipv4addr)), - Ok(None) => Ok(Host::Domain(domain.to_ascii_lowercase())), - Err(e) => Err(e), + if let Some(address) = try!(parse_ipv4addr(&domain)) { + Ok(Host::Ipv4(address)) + } else { + Ok(Host::Domain(domain.into())) } } - - /// Serialize the host as a string. - /// - /// A domain a returned as-is, an IPv6 address between [] square brackets. - pub fn serialize(&self) -> String { - self.to_string() - } } - -impl fmt::Display for Host { +impl> fmt::Display for Host { fn fmt(&self, f: &mut Formatter) -> fmt::Result { match *self { - Host::Domain(ref domain) => domain.fmt(f), + Host::Domain(ref domain) => domain.as_ref().fmt(f), Host::Ipv4(ref addr) => addr.fmt(f), Host::Ipv6(ref addr) => { try!(f.write_str("[")); @@ -88,6 +94,19 @@ impl fmt::Display for Host { } } +/// Parse `input` as a host. 
+/// If successful, write its serialization to `serialization` +/// and return the internal representation for `Url`. +pub fn parse(input: &str, serialization: &mut String) -> ParseResult { + let host = try!(Host::parse(input)); + write!(serialization, "{}", host).unwrap(); + match host { + Host::Domain(_) => Ok(HostInternal::Domain), + Host::Ipv4(address) => Ok(HostInternal::Ipv4(address)), + Host::Ipv6(address) => Ok(HostInternal::Ipv6(address)), + } +} + fn write_ipv6(addr: &Ipv6Addr, f: &mut Formatter) -> fmt::Result { let segments = addr.segments(); let (compress_start, compress_end) = longest_zero_sequence(&segments); @@ -165,6 +184,9 @@ fn parse_ipv4number(mut input: &str) -> ParseResult { } fn parse_ipv4addr(input: &str) -> ParseResult> { + if input.is_empty() { + return Ok(None) + } let mut parts: Vec<&str> = input.split('.').collect(); if parts.last() == Some(&"") { parts.pop(); diff --git a/src/lib.rs b/src/lib.rs index dcc3e9ed..725100f6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -50,42 +50,42 @@ assert!(Url::parse("http://[:::1]") == Err(ParseError::InvalidIpv6Address)) Let’s parse a valid URL and look at its components. 
``` -use url::{Url, SchemeData}; +use url::{Url, Host}; let issue_list_url = Url::parse( "https://github.com/rust-lang/rust/issues?labels=E-easy&state=open" ).unwrap(); -assert!(issue_list_url.scheme == "https".to_string()); -assert!(issue_list_url.domain() == Some("github.com")); +assert!(issue_list_url.scheme() == "https"); +assert!(issue_list_url.username() == ""); +assert!(issue_list_url.password() == None); +assert!(issue_list_url.host_str() == Some("github.com")); +assert!(issue_list_url.host() == Some(Host::Domain("github.com"))); assert!(issue_list_url.port() == None); -assert!(issue_list_url.path() == Some(&["rust-lang".to_string(), - "rust".to_string(), - "issues".to_string()][..])); -assert!(issue_list_url.query == Some("labels=E-easy&state=open".to_string())); -assert!(issue_list_url.fragment == None); -match issue_list_url.scheme_data { - SchemeData::Relative(..) => {}, // Expected - SchemeData::NonRelative(..) => panic!(), -} +assert!(issue_list_url.path() == "/rust-lang/rust/issues"); +assert!(issue_list_url.path_segments().map(|c| c.collect::>()) == + Some(vec!["rust-lang", "rust", "issues"])); +assert!(issue_list_url.query() == Some("labels=E-easy&state=open")); +assert!(issue_list_url.fragment() == None); +assert!(!issue_list_url.non_relative()); ``` -The `scheme`, `query`, and `fragment` are directly fields of the `Url` struct: -they apply to all URLs. -Every other components has accessors because they only apply to URLs said to be -“in a relative scheme”. 
`https` is a relative scheme, but `data` is not: +Some URLs are said to be "non-relative": +they don’t have a username, password, host, or port, +and their "path" is an arbitrary string rather than slash-separated segments: ``` -use url::{Url, SchemeData}; +use url::Url; -let data_url = Url::parse("data:text/plain,Hello#").unwrap(); +let data_url = Url::parse("data:text/plain,Hello?World#").unwrap(); -assert!(data_url.scheme == "data".to_string()); -assert!(data_url.scheme_data == SchemeData::NonRelative("text/plain,Hello".to_string())); -assert!(data_url.non_relative_scheme_data() == Some("text/plain,Hello")); -assert!(data_url.query == None); -assert!(data_url.fragment == Some("".to_string())); +assert!(data_url.non_relative()); +assert!(data_url.scheme() == "data"); +assert!(data_url.path() == "text/plain,Hello"); +assert!(data_url.path_segments().is_none()); +assert!(data_url.query() == Some("World")); +assert!(data_url.fragment() == Some("")); ``` @@ -97,7 +97,7 @@ Many contexts allow URL *references* that can be relative to a *base URL*: ``` -Since parsed URL are absolute, giving a base is required: +Since parsed URL are absolute, giving a base is required for parsing relative URLs: ``` use url::{Url, ParseError}; @@ -105,102 +105,64 @@ use url::{Url, ParseError}; assert!(Url::parse("../main.css") == Err(ParseError::RelativeUrlWithoutBase)) ``` -`UrlParser` is a method-chaining API to provide various optional parameters -to URL parsing, including a base URL. 
- -``` -use url::{Url, UrlParser}; - -let this_document = Url::parse("http://servo.github.io/rust-url/url/index.html").unwrap(); -let css_url = UrlParser::new().base_url(&this_document).parse("../main.css").unwrap(); -assert!(css_url.serialize() == "http://servo.github.io/rust-url/main.css".to_string()); -``` - -For convenience, the `join` method on `Url` is also provided to achieve the same result: +Use the `join` method on an `Url` to use it as a base URL: ``` use url::Url; let this_document = Url::parse("http://servo.github.io/rust-url/url/index.html").unwrap(); let css_url = this_document.join("../main.css").unwrap(); -assert!(&*css_url.serialize() == "http://servo.github.io/rust-url/main.css") +assert_eq!(css_url.as_str(), "http://servo.github.io/rust-url/main.css") */ #![cfg_attr(feature="heap_size", feature(plugin, custom_derive))] #![cfg_attr(feature="heap_size", plugin(heapsize_plugin))] extern crate rustc_serialize; -extern crate uuid; - -#[macro_use] -extern crate matches; - -#[cfg(feature="serde_serialization")] -extern crate serde; - -#[cfg(feature="heap_size")] -#[macro_use] extern crate heapsize; +#[macro_use] extern crate matches; +#[cfg(feature="serde_serialization")] extern crate serde; +#[cfg(feature="heap_size")] #[macro_use] extern crate heapsize; extern crate idna; +extern crate uuid; -use std::fmt::{self, Formatter}; -use std::str; +use host::HostInternal; +use percent_encoding::{PATH_SEGMENT_ENCODE_SET, percent_encode_to}; +use std::cmp; +use std::fmt; +use std::hash; +use std::ops::{Range, RangeFrom, RangeTo}; use std::path::{Path, PathBuf}; -use std::borrow::Borrow; -use std::hash::{Hash, Hasher}; -use std::cmp::Ordering; - -#[cfg(feature="serde_serialization")] -use std::str::FromStr; +use std::str; +use uuid::Uuid; +pub use encoding::EncodingOverride; +pub use parser::ParseError; pub use host::Host; -pub use parser::{ErrorHandler, ParseResult, ParseError}; - -use percent_encoding::{percent_encode, lossy_utf8_percent_decode, 
DEFAULT_ENCODE_SET}; - -use format::{PathFormatter, UserInfoFormatter, UrlNoFragmentFormatter}; -use encoding::EncodingOverride; - -use uuid::Uuid; mod encoding; mod host; mod parser; -pub mod urlutils; pub mod percent_encoding; pub mod form_urlencoded; -pub mod format; -/// The parsed representation of an absolute URL. -#[derive(PartialEq, Eq, Clone, Debug, Hash, PartialOrd, Ord)] +/// A parsed URL record. +#[derive(Clone)] #[cfg_attr(feature="heap_size", derive(HeapSizeOf))] pub struct Url { - /// The scheme (a.k.a. protocol) of the URL, in ASCII lower case. - pub scheme: String, - - /// The components of the URL whose representation depends on where the scheme is *relative*. - pub scheme_data: SchemeData, - - /// The query string of the URL. - /// - /// `None` if the `?` delimiter character was not part of the parsed input, - /// otherwise a possibly empty, percent-encoded string. - /// - /// Percent encoded strings are within the ASCII range. - /// - /// See also the `query_pairs`, `set_query_from_pairs`, - /// and `lossy_percent_decode_query` methods. - pub query: Option, + serialization: String, + non_relative: bool, - /// The fragment identifier of the URL. - /// - /// `None` if the `#` delimiter character was not part of the parsed input, - /// otherwise a possibly empty, percent-encoded string. - /// - /// Percent encoded strings are within the ASCII range. - /// - /// See also the `lossy_percent_decode_fragment` method. 
- pub fragment: Option, + // Components + scheme_end: u32, // Before ':' + username_end: u32, // Before ':' (if a password is given) or '@' (if not) + host_start: u32, + host_end: u32, + host: HostInternal, + port: Option, + path_start: u32, // Before initial '/' if !non_relative + query_start: Option, // Before '?', unlike Position::QueryStart + fragment_start: Option, // Before '#', unlike Position::FragmentStart } /// Opaque identifier for URLs that have file or other schemes @@ -225,391 +187,232 @@ pub enum Origin { UID(OpaqueOrigin), /// Consists of the URL's scheme, host and port - Tuple(String, Host, u16) -} - -/// The components of the URL whose representation depends on where the scheme is *relative*. -#[derive(PartialEq, Eq, Clone, Debug, Hash, PartialOrd, Ord)] -#[cfg_attr(feature="heap_size", derive(HeapSizeOf))] -pub enum SchemeData { - /// Components for URLs in a *relative* scheme such as HTTP. - Relative(RelativeSchemeData), - - /// No further structure is assumed for *non-relative* schemes such as `data` and `mailto`. - /// - /// This is a single percent-encoded string, whose interpretation depends on the scheme. - /// - /// Percent encoded strings are within the ASCII range. - NonRelative(String), -} - -/// Components for URLs in a *relative* scheme such as HTTP. -#[derive(Clone, Debug)] -#[cfg_attr(feature="heap_size", derive(HeapSizeOf))] -pub struct RelativeSchemeData { - /// The username of the URL, as a possibly empty, percent-encoded string. - /// - /// Percent encoded strings are within the ASCII range. - /// - /// See also the `lossy_percent_decode_username` method. - pub username: String, - - /// The password of the URL. - /// - /// `None` if the `:` delimiter character was not part of the parsed input, - /// otherwise a possibly empty, percent-encoded string. - /// - /// Percent encoded strings are within the ASCII range. - /// - /// See also the `lossy_percent_decode_password` method. 
- pub password: Option, - - /// The host of the URL, either a domain name or an IPv4 address - pub host: Host, - - /// The port number of the URL. - /// `None` for file-like schemes, or to indicate the default port number. - pub port: Option, - - /// The default port number for the URL’s scheme. - /// `None` for file-like schemes. - pub default_port: Option, - - /// The path of the URL, as vector of percent-encoded strings. - /// - /// Percent encoded strings are within the ASCII range. - /// - /// See also the `serialize_path` method and, - /// for URLs in the `file` scheme, the `to_file_path` method. - pub path: Vec, + Tuple(String, Host, u16) } -impl RelativeSchemeData { - fn get_identity_key(&self) -> (&String, &Option, &Host, Option, Option, &Vec) { - ( - &self.username, - &self.password, - &self.host, - self.port.or(self.default_port), - self.default_port, - &self.path - ) +impl Url { + /// Parse an absolute URL from a string. + #[inline] + pub fn parse(input: &str) -> Result { + Url::parse_with(input, None, EncodingOverride::utf8(), None) } -} - -impl PartialEq for RelativeSchemeData { - fn eq(&self, other: &RelativeSchemeData) -> bool { - self.get_identity_key() == other.get_identity_key() + /// Parse a string as an URL, with this URL as the base URL. + #[inline] + pub fn join(&self, input: &str) -> Result { + Url::parse_with(input, Some(self), EncodingOverride::utf8(), None) } -} -impl Eq for RelativeSchemeData {} - -impl Hash for RelativeSchemeData { - fn hash(&self, state: &mut H) { - self.get_identity_key().hash(state) + /// The URL parser with all of its parameters. + /// + /// `encoding_override` is a legacy concept only relevant for HTML. + /// When it’s not needed, + /// `s.parse::()`, `Url::from_str(s)` and `url.join(s)` can be used instead. 
+ pub fn parse_with(input: &str, + base_url: Option<&Url>, + encoding_override: EncodingOverride, + log_syntax_violation: Option<&Fn(&'static str)>) + -> Result { + parser::Parser { + serialization: String::with_capacity(input.len()), + base_url: base_url, + query_encoding_override: encoding_override, + log_syntax_violation: log_syntax_violation, + }.parse_url(input) } -} -impl PartialOrd for RelativeSchemeData { - fn partial_cmp(&self, other: &RelativeSchemeData) -> Option { - self.get_identity_key().partial_cmp(&other.get_identity_key()) + #[inline] + pub fn as_str(&self) -> &str { + &self.serialization } -} -impl Ord for RelativeSchemeData { - fn cmp(&self, other: &Self) -> Ordering { - self.get_identity_key().cmp(&other.get_identity_key()) + /// Return the scheme of this URL, as an ASCII string without the ':' delimiter. + #[inline] + pub fn scheme(&self) -> &str { + self.slice(..self.scheme_end) } -} -impl str::FromStr for Url { - type Err = ParseError; - - fn from_str(url: &str) -> ParseResult { - Url::parse(url) + /// Return whether this URL is non-relative (typical of e.g. `data:` and `mailto:` URLs.) + #[inline] + pub fn non_relative(&self) -> bool { + self.non_relative } -} -/// A set of optional parameters for URL parsing. -pub struct UrlParser<'a> { - base_url: Option<&'a Url>, - query_encoding_override: EncodingOverride, - error_handler: ErrorHandler, - scheme_type_mapper: fn(scheme: &str) -> SchemeType, -} - - -/// A method-chaining API to provide a set of optional parameters for URL parsing. -impl<'a> UrlParser<'a> { - /// Return a new UrlParser with default parameters. 
- #[inline] - pub fn new() -> UrlParser<'a> { - fn silent_handler(_reason: ParseError) -> ParseResult<()> { Ok(()) } - UrlParser { - base_url: None, - query_encoding_override: EncodingOverride::utf8(), - error_handler: silent_handler, - scheme_type_mapper: whatwg_scheme_type_mapper, + /// Return the username for this URL (typically the empty string) + /// as a percent-encoded ASCII string. + pub fn username(&self) -> &str { + if self.slice(self.scheme_end..).starts_with("://") { + self.slice(self.scheme_end + 3..self.username_end) + } else { + "" } } - /// Set the base URL used for resolving relative URL references, and return the `UrlParser`. - /// The default is no base URL, so that relative URLs references fail to parse. - #[inline] - pub fn base_url<'b>(&'b mut self, value: &'a Url) -> &'b mut UrlParser<'a> { - self.base_url = Some(value); - self + /// Return the password for this URL, if any, as a percent-encoded ASCII string. + pub fn password(&self) -> Option<&str> { + if self.byte_at(self.username_end) == b':' { + debug_assert!(self.has_host()); + debug_assert!(self.byte_at(self.host_start - 1) == b'@'); + Some(self.slice(self.username_end + 1..self.host_start - 1)) + } else { + None + } } - /// Set the character encoding the query string is encoded as before percent-encoding, - /// and return the `UrlParser`. - /// - /// This legacy quirk is only relevant to HTML. + /// Return whether this URL has a host. /// - /// This method is only available if the `query_encoding` Cargo feature is enabled. 
- #[cfg(feature = "query_encoding")] + /// Non-relative URLs (typical of `data:` and `mailto:`) and some `file:` URLs don’ #[inline] - pub fn query_encoding_override<'b>(&'b mut self, value: encoding::EncodingRef) - -> &'b mut UrlParser<'a> { - self.query_encoding_override = EncodingOverride::from_encoding(value); - self + pub fn has_host(&self) -> bool { + !matches!(self.host, HostInternal::None) } - /// Set an error handler for non-fatal parse errors, and return the `UrlParser`. + /// Return the string representation of the host (domain or IP address) for this URL, if any. + /// Non-ASCII domains are punycode-encoded per IDNA. /// - /// Non-fatal parse errors are normally ignored by the parser, - /// but indicate violations of authoring requirements. - /// An error handler can be used, for example, to log these errors in the console - /// of a browser’s developer tools. + /// Non-relative URLs (typical of `data:` and `mailto:`) and some `file:` URLs + /// don’t have a host. /// - /// The error handler can choose to make the error fatal by returning `Err(..)` - #[inline] - pub fn error_handler<'b>(&'b mut self, value: ErrorHandler) -> &'b mut UrlParser<'a> { - self.error_handler = value; - self + /// See also the `host` method. + pub fn host_str(&self) -> Option<&str> { + if self.has_host() { + Some(self.slice(self.host_start..self.host_end)) + } else { + None + } } - /// Set a *scheme type mapper*, and return the `UrlParser`. + /// Return the parsed representation of the host for this URL. + /// Non-ASCII domain labels are punycode-encoded per IDNA. /// - /// The URL parser behaves differently based on the `SchemeType` of the URL. - /// See the documentation for `SchemeType` for more details. - /// A *scheme type mapper* returns a `SchemeType` - /// based on the scheme as an ASCII lower case string, - /// as found in the `scheme` field of an `Url` struct. + /// Non-relative URLs (typical of `data:` and `mailto:`) and some `file:` URLs + /// don’t have a host. 
/// - /// The default scheme type mapper is as follows: - /// - /// ``` - /// # use url::SchemeType; - /// fn whatwg_scheme_type_mapper(scheme: &str) -> SchemeType { - /// match scheme { - /// "file" => SchemeType::FileLike, - /// "ftp" => SchemeType::Relative(21), - /// "gopher" => SchemeType::Relative(70), - /// "http" => SchemeType::Relative(80), - /// "https" => SchemeType::Relative(443), - /// "ws" => SchemeType::Relative(80), - /// "wss" => SchemeType::Relative(443), - /// _ => SchemeType::NonRelative, - /// } - /// } - /// ``` - /// - /// Note that unknown schemes default to non-relative. - /// Overriding the scheme type mapper can allow, for example, - /// parsing URLs in the `git` or `irc` scheme as relative. - #[inline] - pub fn scheme_type_mapper<'b>(&'b mut self, value: fn(scheme: &str) -> SchemeType) - -> &'b mut UrlParser<'a> { - self.scheme_type_mapper = value; - self + /// See also the `host_str` method. + pub fn host(&self) -> Option> { + match self.host { + HostInternal::None => None, + HostInternal::Domain => Some(Host::Domain(self.slice(self.host_start..self.host_end))), + HostInternal::Ipv4(address) => Some(Host::Ipv4(address)), + HostInternal::Ipv6(address) => Some(Host::Ipv6(address)), + } } - /// Parse `input` as an URL, with all the parameters previously set in the `UrlParser`. + /// Return the port number for this URL, if any. #[inline] - pub fn parse(&self, input: &str) -> ParseResult { - parser::parse_url(input, self) + pub fn port(&self) -> Option { + self.port } - /// Parse `input` as a “standalone” URL path, - /// with an optional query string and fragment identifier. - /// - /// This is typically found in the start line of an HTTP header. + /// Return the port number for this URL, or the default port number if it is known. /// - /// Note that while the start line has no fragment identifier in the HTTP RFC, - /// servers typically parse it and ignore it - /// (rather than having it be part of the path or query string.) 
+ /// This method only knows the default port number + /// of the `http`, `https`, `ws`, `wss`, `ftp`, and `gopher` schemes. /// - /// On success, return `(path, query_string, fragment_identifier)` + /// For URLs in these schemes, this method always returns `Some(_)`. + /// For other schemes, it is the same as `Url::port()`. #[inline] - pub fn parse_path(&self, input: &str) - -> ParseResult<(Vec, Option, Option)> { - parser::parse_standalone_path(input, self) - } -} - - -/// Parse `input` as a “standalone” URL path, -/// with an optional query string and fragment identifier. -/// -/// This is typically found in the start line of an HTTP header. -/// -/// Note that while the start line has no fragment identifier in the HTTP RFC, -/// servers typically parse it and ignore it -/// (rather than having it be part of the path or query string.) -/// -/// On success, return `(path, query_string, fragment_identifier)` -/// -/// ```rust -/// let (path, query, fragment) = url::parse_path("/foo/bar/../baz?q=42").unwrap(); -/// assert_eq!(path, vec!["foo".to_string(), "baz".to_string()]); -/// assert_eq!(query, Some("q=42".to_string())); -/// assert_eq!(fragment, None); -/// ``` -/// -/// The query string returned by `url::parse_path` can be decoded with -/// `url::form_urlencoded::parse`. -#[inline] -pub fn parse_path(input: &str) - -> ParseResult<(Vec, Option, Option)> { - UrlParser::new().parse_path(input) -} - - -/// Private convenience methods for use in parser.rs -impl<'a> UrlParser<'a> { - #[inline] - fn parse_error(&self, error: ParseError) -> ParseResult<()> { - (self.error_handler)(error) + pub fn port_or_default(&self) -> Option { + self.port.or_else(|| parser::default_port(self.scheme())) + } + + /// Return the path for this URL, as a percent-encoded ASCII string. + /// For relative URLs, this starts with a '/' slash + /// and continues with slash-separated path segments. + /// For non-relative URLs, this is an arbitrary string that doesn’t start with '/'. 
+ pub fn path(&self) -> &str { + match (self.query_start, self.fragment_start) { + (None, None) => self.slice(self.path_start..), + (Some(next_component_start), _) | + (None, Some(next_component_start)) => { + self.slice(self.path_start..next_component_start) + } + } } - #[inline] - fn get_scheme_type(&self, scheme: &str) -> SchemeType { - (self.scheme_type_mapper)(scheme) - } -} - - -/// Determines the behavior of the URL parser for a given scheme. -#[derive(PartialEq, Eq, Copy, Debug, Clone, Hash, PartialOrd, Ord)] -pub enum SchemeType { - /// Indicate that the scheme is *non-relative*. - /// - /// The *scheme data* of the URL - /// (everything other than the scheme, query string, and fragment identifier) - /// is parsed as a single percent-encoded string of which no structure is assumed. - /// That string may need to be parsed further, per a scheme-specific format. - NonRelative, - - /// Indicate that the scheme is *relative*, and what the default port number is. - /// - /// The *scheme data* is structured as - /// *username*, *password*, *host*, *port number*, and *path*. - /// Relative URL references are supported, if a base URL was given. - /// The string value indicates the default port number as a string of ASCII digits, - /// or the empty string to indicate no default port number. - Relative(u16), - - /// Indicate a *relative* scheme similar to the *file* scheme. + /// If this URL is relative, return an iterator of '/' slash-separated path segments, + /// each as a percent-encoded ASCII string. /// - /// For example, you might want to have distinct `git+file` and `hg+file` URL schemes. - /// - /// This is like `Relative` except the host can be empty, there is no port number, - /// and path parsing has (platform-independent) quirks to support Windows filenames. 
- FileLike, -} - -impl SchemeType { - pub fn default_port(&self) -> Option { - match *self { - SchemeType::Relative(default_port) => Some(default_port), - _ => None, - } - } - pub fn same_as(&self, other: SchemeType) -> bool { - match (self, other) { - (&SchemeType::NonRelative, SchemeType::NonRelative) => true, - (&SchemeType::Relative(_), SchemeType::Relative(_)) => true, - (&SchemeType::FileLike, SchemeType::FileLike) => true, - _ => false + /// Return `None` for non-relative URLs, or an iterator of at least one string. + pub fn path_segments(&self) -> Option> { + if self.non_relative { + None + } else { + let path = self.path(); + debug_assert!(path.starts_with("/")); + Some(path[1..].split('/')) } } -} -/// http://url.spec.whatwg.org/#special-scheme -pub fn whatwg_scheme_type_mapper(scheme: &str) -> SchemeType { - match scheme { - "file" => SchemeType::FileLike, - "ftp" => SchemeType::Relative(21), - "gopher" => SchemeType::Relative(70), - "http" => SchemeType::Relative(80), - "https" => SchemeType::Relative(443), - "ws" => SchemeType::Relative(80), - "wss" => SchemeType::Relative(443), - _ => SchemeType::NonRelative, + /// Return this URL’s query string, if any, as a percent-encoded ASCII string. + pub fn query(&self) -> Option<&str> { + match (self.query_start, self.fragment_start) { + (None, _) => None, + (Some(query_start), None) => { + debug_assert!(self.byte_at(query_start) == b'?'); + Some(self.slice(query_start + 1..)) + } + (Some(query_start), Some(fragment_start)) => { + debug_assert!(self.byte_at(query_start) == b'?'); + Some(self.slice(query_start + 1..fragment_start)) + } + } } -} - -impl Url { - /// Parse an URL with the default `UrlParser` parameters. - /// - /// In particular, relative URL references are parse errors since no base URL is provided. - #[inline] - pub fn parse(input: &str) -> ParseResult { - UrlParser::new().parse(input) + /// Return this URL’s fragment identifier, if any, as a percent-encoded ASCII string. 
+ pub fn fragment(&self) -> Option<&str> { + self.fragment_start.map(|start| { + debug_assert!(self.byte_at(start) == b'#'); + self.slice(start + 1..) + }) } /// Convert a file name as `std::path::Path` into an URL in the `file` scheme. /// - /// This returns `Err` if the given path is not absolute - /// or, with a Windows path, if the prefix is not a disk prefix (e.g. `C:`). + /// This returns `Err` if the given path is not absolute or, + /// on Windows, if the prefix is not a disk prefix (e.g. `C:`). pub fn from_file_path>(path: P) -> Result { - let path = try!(path_to_file_url_path(path.as_ref())); - Ok(Url::from_path_common(path)) + let mut serialization = "file://".to_owned(); + let path_start = serialization.len() as u32; + try!(path_to_file_url_segments(path.as_ref(), &mut serialization)); + Ok(Url { + serialization: serialization, + non_relative: false, + scheme_end: "file".len() as u32, + username_end: path_start, + host_start: path_start, + host_end: path_start, + host: HostInternal::None, + port: None, + path_start: path_start, + query_start: None, + fragment_start: None, + }) } /// Convert a directory name as `std::path::Path` into an URL in the `file` scheme. /// - /// This returns `Err` if the given path is not absolute - /// or, with a Windows path, if the prefix is not a disk prefix (e.g. `C:`). + /// This returns `Err` if the given path is not absolute or, + /// on Windows, if the prefix is not a disk prefix (e.g. `C:`). /// - /// Compared to `from_file_path`, this adds an empty component to the path - /// (or, in terms of URL syntax, adds a trailing slash) + /// Compared to `from_file_path`, this ensure that URL’s the path has a trailing slash /// so that the entire path is considered when using this URL as a base URL. 
/// /// For example: /// /// * `"index.html"` parsed with `Url::from_directory_path(Path::new("/var/www"))` /// as the base URL is `file:///var/www/index.html` - /// * `"index.html"` parsed with `Url::from_file_path(Path::new("/var/www/"))` + /// * `"index.html"` parsed with `Url::from_file_path(Path::new("/var/www"))` /// as the base URL is `file:///var/index.html`, which might not be what was intended. /// - /// (Note that `Path::new` removes any trailing slash.) + /// Note that `std::path` does not consider trailing slashes significant + /// and usually does not include them (e.g. in `Path::parent()`). pub fn from_directory_path>(path: P) -> Result { - let mut path = try!(path_to_file_url_path(path.as_ref())); - // Add an empty path component (i.e. a trailing slash in serialization) - // so that the entire path is used as a base URL. - path.push("".to_owned()); - Ok(Url::from_path_common(path)) - } - - fn from_path_common(path: Vec) -> Url { - Url { - scheme: "file".to_owned(), - scheme_data: SchemeData::Relative(RelativeSchemeData { - username: "".to_owned(), - password: None, - port: None, - default_port: None, - host: Host::Domain("".to_owned()), - path: path, - }), - query: None, - fragment: None, + let mut url = try!(Url::from_file_path(path)); + if !url.serialization.ends_with('/') { + url.serialization.push('/') } + Ok(url) } /// Assuming the URL is in the `file` scheme or similar, @@ -631,29 +434,28 @@ impl Url { /// for a Windows path, is not UTF-8.) #[inline] pub fn to_file_path(&self) -> Result { - match self.scheme_data { - SchemeData::Relative(ref scheme_data) => scheme_data.to_file_path(), - SchemeData::NonRelative(..) => Err(()), + // FIXME: Figure out what to do w.r.t host. + if matches!(self.host(), None | Some(Host::Domain("localhost"))) { + if let Some(segments) = self.path_segments() { + return file_url_segments_to_pathbuf(segments) + } } - } - - /// Return the serialization of this URL as a string. 
- pub fn serialize(&self) -> String { - self.to_string() + Err(()) } /// Return the origin of this URL (https://url.spec.whatwg.org/#origin) pub fn origin(&self) -> Origin { - match &*self.scheme { + let scheme = self.scheme(); + match scheme { "blob" => { - let result = Url::parse(self.non_relative_scheme_data().unwrap()); + let result = Url::parse(self.path()); match result { Ok(ref url) => url.origin(), Err(_) => Origin::UID(OpaqueOrigin::new()) } }, "ftp" | "gopher" | "http" | "https" | "ws" | "wss" => { - Origin::Tuple(self.scheme.clone(), self.host().unwrap().clone(), + Origin::Tuple(scheme.to_owned(), self.host().unwrap().to_owned(), self.port_or_default().unwrap()) }, // TODO: Figure out what to do if the scheme is a file @@ -662,220 +464,123 @@ impl Url { } } - /// Return the serialization of this URL, without the fragment identifier, as a string - pub fn serialize_no_fragment(&self) -> String { - UrlNoFragmentFormatter{ url: self }.to_string() - } - - /// If the URL is *non-relative*, return the string scheme data. - #[inline] - pub fn non_relative_scheme_data(&self) -> Option<&str> { - match self.scheme_data { - SchemeData::Relative(..) => None, - SchemeData::NonRelative(ref scheme_data) => Some(scheme_data), - } - } - - /// If the URL is *non-relative*, return a mutable reference to the string scheme data. - #[inline] - pub fn non_relative_scheme_data_mut(&mut self) -> Option<&mut String> { - match self.scheme_data { - SchemeData::Relative(..) => None, - SchemeData::NonRelative(ref mut scheme_data) => Some(scheme_data), - } - } - - /// If the URL is in a *relative scheme*, return the structured scheme data. - #[inline] - pub fn relative_scheme_data(&self) -> Option<&RelativeSchemeData> { - match self.scheme_data { - SchemeData::Relative(ref scheme_data) => Some(scheme_data), - SchemeData::NonRelative(..) => None, - } - } - - /// If the URL is in a *relative scheme*, - /// return a mutable reference to the structured scheme data. 
- #[inline] - pub fn relative_scheme_data_mut(&mut self) -> Option<&mut RelativeSchemeData> { - match self.scheme_data { - SchemeData::Relative(ref mut scheme_data) => Some(scheme_data), - SchemeData::NonRelative(..) => None, - } - } - - /// If the URL is in a *relative scheme*, return its username. - #[inline] - pub fn username(&self) -> Option<&str> { - self.relative_scheme_data().map(|scheme_data| &*scheme_data.username) - } - - /// If the URL is in a *relative scheme*, return a mutable reference to its username. - #[inline] - pub fn username_mut(&mut self) -> Option<&mut String> { - self.relative_scheme_data_mut().map(|scheme_data| &mut scheme_data.username) - } - - /// Percent-decode the URL’s username, if any. - /// - /// This is “lossy”: invalid UTF-8 percent-encoded byte sequences - /// will be replaced � U+FFFD, the replacement character. - #[inline] - pub fn lossy_percent_decode_username(&self) -> Option { - self.relative_scheme_data().map(|scheme_data| scheme_data.lossy_percent_decode_username()) - } - - /// If the URL is in a *relative scheme*, return its password, if any. - #[inline] - pub fn password(&self) -> Option<&str> { - self.relative_scheme_data().and_then(|scheme_data| - scheme_data.password.as_ref().map(|password| password as &str)) - } - - /// If the URL is in a *relative scheme*, return a mutable reference to its password, if any. - #[inline] - pub fn password_mut(&mut self) -> Option<&mut String> { - self.relative_scheme_data_mut().and_then(|scheme_data| scheme_data.password.as_mut()) - } - - /// Percent-decode the URL’s password, if any. - /// - /// This is “lossy”: invalid UTF-8 percent-encoded byte sequences - /// will be replaced � U+FFFD, the replacement character. + /// Parse the URL’s query string, if any, as `application/x-www-form-urlencoded` + /// and return a vector of (key, value) pairs. 
#[inline] - pub fn lossy_percent_decode_password(&self) -> Option { - self.relative_scheme_data().and_then(|scheme_data| - scheme_data.lossy_percent_decode_password()) + pub fn query_pairs(&self) -> Option> { + self.query().map(|query| form_urlencoded::parse(query.as_bytes())) } - /// Serialize the URL's username and password, if any. - /// - /// Format: ":@" - #[inline] - pub fn serialize_userinfo(&mut self) -> Option { - self.relative_scheme_data().map(|scheme_data| scheme_data.serialize_userinfo()) - } + // Private helper methods: - /// If the URL is in a *relative scheme*, return its structured host. #[inline] - pub fn host(&self) -> Option<&Host> { - self.relative_scheme_data().map(|scheme_data| &scheme_data.host) + fn slice(&self, range: R) -> &str where R: RangeArg { + range.slice_of(&self.serialization) } - /// If the URL is in a *relative scheme*, return a mutable reference to its structured host. #[inline] - pub fn host_mut(&mut self) -> Option<&mut Host> { - self.relative_scheme_data_mut().map(|scheme_data| &mut scheme_data.host) + fn byte_at(&self, i: u32) -> u8 { + self.serialization.as_bytes()[i as usize] } +} - /// If the URL is in a *relative scheme* and its host is a domain, - /// return the domain as a string. - #[inline] - pub fn domain(&self) -> Option<&str> { - self.relative_scheme_data().and_then(|scheme_data| scheme_data.domain()) - } +/// Parse a string as an URL, without a base URL or encoding override. +impl str::FromStr for Url { + type Err = ParseError; - /// If the URL is in a *relative scheme* and its host is a domain, - /// return a mutable reference to the domain string. #[inline] - pub fn domain_mut(&mut self) -> Option<&mut String> { - self.relative_scheme_data_mut().and_then(|scheme_data| scheme_data.domain_mut()) + fn from_str(input: &str) -> Result { + Url::parse(input) } +} - /// If the URL is in a *relative scheme*, serialize its host as a string. 
- /// - /// A domain a returned as-is, an IPv6 address between [] square brackets. +/// Display the serialization of this URL. +impl fmt::Display for Url { #[inline] - pub fn serialize_host(&self) -> Option { - self.relative_scheme_data().map(|scheme_data| scheme_data.host.serialize()) + fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(&self.serialization, formatter) } +} - /// If the URL is in a *relative scheme* and has a port number, return it. +/// Debug the serialization of this URL. +impl fmt::Debug for Url { #[inline] - pub fn port(&self) -> Option { - self.relative_scheme_data().and_then(|scheme_data| scheme_data.port) + fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + fmt::Debug::fmt(&self.serialization, formatter) } +} - /// If the URL is in a *relative scheme*, return a mutable reference to its port. - #[inline] - pub fn port_mut(&mut self) -> Option<&mut Option> { - self.relative_scheme_data_mut().map(|scheme_data| &mut scheme_data.port) - } +/// URLs compare like their serialization. +impl Eq for Url {} - /// If the URL is in a *relative scheme* that is not a file-like, - /// return its port number, even if it is the default. +/// URLs compare like their serialization. +impl PartialEq for Url { #[inline] - pub fn port_or_default(&self) -> Option { - self.relative_scheme_data().and_then(|scheme_data| scheme_data.port_or_default()) + fn eq(&self, other: &Self) -> bool { + self.serialization == other.serialization } +} - /// If the URL is in a *relative scheme*, return its path components. +/// URLs compare like their serialization. +impl Ord for Url { #[inline] - pub fn path(&self) -> Option<&[String]> { - self.relative_scheme_data().map(|scheme_data| &*scheme_data.path) + fn cmp(&self, other: &Self) -> cmp::Ordering { + self.serialization.cmp(&other.serialization) } +} - /// If the URL is in a *relative scheme*, return a mutable reference to its path components. 
+/// URLs compare like their serialization. +impl PartialOrd for Url { #[inline] - pub fn path_mut(&mut self) -> Option<&mut Vec> { - self.relative_scheme_data_mut().map(|scheme_data| &mut scheme_data.path) + fn partial_cmp(&self, other: &Self) -> Option { + self.serialization.partial_cmp(&other.serialization) } +} - /// If the URL is in a *relative scheme*, serialize its path as a string. - /// - /// The returned string starts with a "/" slash, and components are separated by slashes. - /// A trailing slash represents an empty last component. +/// URLs hash like their serialization. +impl hash::Hash for Url { #[inline] - pub fn serialize_path(&self) -> Option { - self.relative_scheme_data().map(|scheme_data| scheme_data.serialize_path()) + fn hash(&self, state: &mut H) where H: hash::Hasher { + hash::Hash::hash(&self.serialization, state) } +} - /// Parse the URL’s query string, if any, as `application/x-www-form-urlencoded` - /// and return a vector of (key, value) pairs. +/// Return the serialization of this URL. +impl AsRef for Url { #[inline] - pub fn query_pairs(&self) -> Option> { - self.query.as_ref().map(|query| form_urlencoded::parse(query.as_bytes())) + fn as_ref(&self) -> &str { + &self.serialization } +} - /// Serialize an iterator of (key, value) pairs as `application/x-www-form-urlencoded` - /// and set it as the URL’s query string. - #[inline] - pub fn set_query_from_pairs(&mut self, pairs: I) - where I: IntoIterator, I::Item: Borrow<(K, V)>, K: AsRef, V: AsRef { - self.query = Some(form_urlencoded::serialize(pairs)); - } +trait RangeArg { + fn slice_of<'a>(&self, s: &'a str) -> &'a str; +} - /// Percent-decode the URL’s query string, if any. - /// - /// This is “lossy”: invalid UTF-8 percent-encoded byte sequences - /// will be replaced � U+FFFD, the replacement character. 
+impl RangeArg for Range { #[inline] - pub fn lossy_percent_decode_query(&self) -> Option { - self.query.as_ref().map(|value| lossy_utf8_percent_decode(value.as_bytes())) + fn slice_of<'a>(&self, s: &'a str) -> &'a str { + &s[self.start as usize .. self.end as usize] } +} - /// Percent-decode the URL’s fragment identifier, if any. - /// - /// This is “lossy”: invalid UTF-8 percent-encoded byte sequences - /// will be replaced � U+FFFD, the replacement character. +impl RangeArg for RangeFrom { #[inline] - pub fn lossy_percent_decode_fragment(&self) -> Option { - self.fragment.as_ref().map(|value| lossy_utf8_percent_decode(value.as_bytes())) + fn slice_of<'a>(&self, s: &'a str) -> &'a str { + &s[self.start as usize ..] } +} - /// Join a path with a base URL. - /// - /// Corresponds to the basic URL parser where `self` is the given base URL. +impl RangeArg for RangeTo { #[inline] - pub fn join(&self, input: &str) -> ParseResult { - UrlParser::new().base_url(self).parse(input) + fn slice_of<'a>(&self, s: &'a str) -> &'a str { + &s[.. 
self.end as usize] } } - impl rustc_serialize::Encodable for Url { fn encode(&self, encoder: &mut S) -> Result<(), S::Error> { - encoder.emit_str(&self.to_string()) + encoder.emit_str(self.as_str()) } } @@ -905,174 +610,32 @@ impl serde::Serialize for Url { impl serde::Deserialize for Url { fn deserialize(deserializer: &mut D) -> Result where D: serde::Deserializer { let string_representation: String = try!(serde::Deserialize::deserialize(deserializer)); - Ok(FromStr::from_str(&string_representation[..]).unwrap()) - } -} - -impl fmt::Display for Url { - fn fmt(&self, formatter: &mut Formatter) -> fmt::Result { - try!(UrlNoFragmentFormatter{ url: self }.fmt(formatter)); - if let Some(ref fragment) = self.fragment { - try!(formatter.write_str("#")); - try!(formatter.write_str(fragment)); - } - Ok(()) - } -} - - -impl fmt::Display for SchemeData { - fn fmt(&self, formatter: &mut Formatter) -> fmt::Result { - match *self { - SchemeData::Relative(ref scheme_data) => scheme_data.fmt(formatter), - SchemeData::NonRelative(ref scheme_data) => scheme_data.fmt(formatter), - } + Ok(Url::parse(&string_representation).unwrap()) } } - -impl RelativeSchemeData { - /// Percent-decode the URL’s username. - /// - /// This is “lossy”: invalid UTF-8 percent-encoded byte sequences - /// will be replaced � U+FFFD, the replacement character. - #[inline] - pub fn lossy_percent_decode_username(&self) -> String { - lossy_utf8_percent_decode(self.username.as_bytes()) - } - - /// Percent-decode the URL’s password, if any. - /// - /// This is “lossy”: invalid UTF-8 percent-encoded byte sequences - /// will be replaced � U+FFFD, the replacement character. - #[inline] - pub fn lossy_percent_decode_password(&self) -> Option { - self.password.as_ref().map(|value| lossy_utf8_percent_decode(value.as_bytes())) - } - - /// Assuming the URL is in the `file` scheme or similar, - /// convert its path to an absolute `std::path::Path`. 
- /// - /// **Note:** This does not actually check the URL’s `scheme`, - /// and may give nonsensical results for other schemes. - /// It is the user’s responsibility to check the URL’s scheme before calling this. - /// - /// ``` - /// # use url::Url; - /// # let url = Url::parse("file:///etc/passwd").unwrap(); - /// let path = url.to_file_path(); - /// ``` - /// - /// Returns `Err` if the host is neither empty nor `"localhost"`, - /// or if `Path::new_opt()` returns `None`. - /// (That is, if the percent-decoded path contains a NUL byte or, - /// for a Windows path, is not UTF-8.) - #[inline] - pub fn to_file_path(&self) -> Result { - // FIXME: Figure out what to do w.r.t host. - if !matches!(self.domain(), Some("") | Some("localhost")) { - return Err(()) - } - file_url_path_to_pathbuf(&self.path) - } - - /// If the host is a domain, return the domain as a string. - #[inline] - pub fn domain(&self) -> Option<&str> { - match self.host { - Host::Domain(ref domain) => Some(domain), - _ => None, - } - } - - /// If the host is a domain, return a mutable reference to the domain string. - #[inline] - pub fn domain_mut(&mut self) -> Option<&mut String> { - match self.host { - Host::Domain(ref mut domain) => Some(domain), - _ => None, - } - } - - /// Return the port number of the URL, even if it is the default. - /// Return `None` for file-like URLs. - #[inline] - pub fn port_or_default(&self) -> Option { - self.port.or(self.default_port) - } - - /// Serialize the path as a string. - /// - /// The returned string starts with a "/" slash, and components are separated by slashes. - /// A trailing slash represents an empty last component. - pub fn serialize_path(&self) -> String { - PathFormatter { - path: &self.path - }.to_string() - } - - /// Serialize the userinfo as a string. - /// - /// Format: ":@". 
- pub fn serialize_userinfo(&self) -> String { - UserInfoFormatter { - username: &self.username, - password: self.password.as_ref().map(|s| s as &str) - }.to_string() - } -} - - -impl fmt::Display for RelativeSchemeData { - fn fmt(&self, formatter: &mut Formatter) -> fmt::Result { - // Write the scheme-trailing double slashes. - try!(formatter.write_str("//")); - - // Write the user info. - try!(UserInfoFormatter { - username: &self.username, - password: self.password.as_ref().map(|s| s as &str) - }.fmt(formatter)); - - // Write the host. - try!(self.host.fmt(formatter)); - - // Write the port. - match self.port { - Some(port) => { - try!(write!(formatter, ":{}", port)); - }, - None => {} - } - - // Write the path. - PathFormatter { - path: &self.path - }.fmt(formatter) - } -} - - #[cfg(unix)] -fn path_to_file_url_path(path: &Path) -> Result, ()> { +fn path_to_file_url_segments(path: &Path, serialization: &mut String) -> Result<(), ()> { use std::os::unix::prelude::OsStrExt; if !path.is_absolute() { return Err(()) } // skip the root component - Ok(path.components().skip(1).map(|c| { - percent_encode(c.as_os_str().as_bytes(), DEFAULT_ENCODE_SET) - }).collect()) + for component in path.components().skip(1) { + serialization.push('/'); + percent_encode_to(component.as_os_str().as_bytes(), PATH_SEGMENT_ENCODE_SET, serialization) + } + Ok(()) } #[cfg(windows)] -fn path_to_file_url_path(path: &Path) -> Result, ()> { - path_to_file_url_path_windows(path) +fn path_to_file_url_segments(path: &Path, serialization: &mut String) -> Result<(), ()> { + path_to_file_url_segments_windows(path, serialization) } // Build this unconditionally to alleviate https://github.com/servo/rust-url/issues/102 #[cfg_attr(not(windows), allow(dead_code))] -fn path_to_file_url_path_windows(path: &Path) -> Result, ()> { +fn path_to_file_url_segments_windows(path: &Path, serialization: &mut String) -> Result<(), ()> { use std::path::{Prefix, Component}; if !path.is_absolute() { return Err(()) @@ 
-1090,35 +653,32 @@ fn path_to_file_url_path_windows(path: &Path) -> Result, ()> { }; // Start with the prefix, e.g. "C:" - let mut path = vec![format!("{}:", disk as char)]; + serialization.push('/'); + serialization.push(disk as char); + serialization.push(':'); for component in components { if component == Component::RootDir { continue } // FIXME: somehow work with non-unicode? - let part = match component.as_os_str().to_str() { - Some(s) => s, - None => return Err(()), - }; - path.push(percent_encode(part.as_bytes(), DEFAULT_ENCODE_SET)); + let component = try!(component.as_os_str().to_str().ok_or(())); + serialization.push('/'); + percent_encode_to(component.as_bytes(), PATH_SEGMENT_ENCODE_SET, serialization); } - Ok(path) + Ok(()) } #[cfg(unix)] -fn file_url_path_to_pathbuf(path: &[String]) -> Result { +fn file_url_segments_to_pathbuf(segments: str::Split) -> Result { use std::ffi::OsStr; use std::os::unix::prelude::OsStrExt; use std::path::PathBuf; use percent_encoding::percent_decode_to; - if path.is_empty() { - return Ok(PathBuf::from("/")) - } let mut bytes = Vec::new(); - for path_part in path { + for segment in segments { bytes.push(b'/'); - percent_decode_to(path_part.as_bytes(), &mut bytes); + percent_decode_to(segment.as_bytes(), &mut bytes); } let os_str = OsStr::from_bytes(&bytes); let path = PathBuf::from(os_str); @@ -1128,29 +688,26 @@ fn file_url_path_to_pathbuf(path: &[String]) -> Result { } #[cfg(windows)] -fn file_url_path_to_pathbuf(path: &[String]) -> Result { - file_url_path_to_pathbuf_windows(path) +fn file_url_segments_to_pathbuf(segments: str::Split) -> Result { + file_url_segments_to_pathbuf_windows(segments) } // Build this unconditionally to alleviate https://github.com/servo/rust-url/issues/102 #[cfg_attr(not(windows), allow(dead_code))] -fn file_url_path_to_pathbuf_windows(path: &[String]) -> Result { +fn file_url_segments_to_pathbuf_windows(mut segments: str::Split) -> Result { use percent_encoding::percent_decode; - if 
path.is_empty() { - return Err(()) - } - let prefix = &*path[0]; - if prefix.len() != 2 || !parser::starts_with_ascii_alpha(prefix) - || prefix.as_bytes()[1] != b':' { + let first = try!(segments.next().ok_or(())); + if first.len() != 2 || !first.starts_with(parser::ascii_alpha) + || first.as_bytes()[1] != b':' { return Err(()) } - let mut string = prefix.to_owned(); - for path_part in &path[1..] { + let mut string = first.to_owned(); + for segment in segments { string.push('\\'); // Currently non-unicode windows paths cannot be represented - match String::from_utf8(percent_decode(path_part.as_bytes())) { + match String::from_utf8(percent_decode(segment.as_bytes())) { Ok(s) => string.push_str(&s), Err(..) => return Err(()), } diff --git a/src/parser.rs b/src/parser.rs index ae8182dd..9e035bcd 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -7,20 +7,18 @@ // except according to those terms. use std::ascii::AsciiExt; -use std::cmp::max; use std::error::Error; -use std::fmt::{self, Formatter}; +use std::fmt::{self, Formatter, Write}; -use super::{UrlParser, Url, SchemeData, RelativeSchemeData, Host, SchemeType}; +use super::{Url, EncodingOverride}; +use host::{self, HostInternal}; use percent_encoding::{ - utf8_percent_encode_to, percent_encode, + utf8_percent_encode_to, percent_encode_to, SIMPLE_ENCODE_SET, DEFAULT_ENCODE_SET, USERINFO_ENCODE_SET, QUERY_ENCODE_SET }; - pub type ParseResult = Result; - macro_rules! simple_enum_error { ($($name: ident => $description: expr,)+) => { /// Errors that can occur during parsing. @@ -45,6 +43,7 @@ macro_rules! simple_enum_error { simple_enum_error! { EmptyHost => "empty host", + IdnaError => "invalid international domain name", InvalidScheme => "invalid scheme", InvalidPort => "invalid port number", InvalidIpv4Address => "invalid IPv4 address", @@ -69,6 +68,7 @@ simple_enum_error! 
{ CannotSetHostWithNonRelativeScheme => "cannot set host with non-relative scheme", CannotSetPortWithNonRelativeScheme => "cannot set port with non-relative scheme", CannotSetPathWithNonRelativeScheme => "cannot set path with non-relative scheme", + Overflow => "URLs more than 4 GB are not supported", } impl fmt::Display for ParseError { @@ -77,14 +77,9 @@ impl fmt::Display for ParseError { } } -/// This is called on non-fatal parse errors. -/// -/// The handler can choose to continue or abort parsing by returning Ok() or Err(), respectively. -/// See the `UrlParser::error_handler` method. -/// -/// FIXME: make this a by-ref closure when that’s supported. -pub type ErrorHandler = fn(reason: ParseError) -> ParseResult<()>; - +impl From<::idna::uts46::Errors> for ParseError { + fn from(_: ::idna::uts46::Errors) -> ParseError { ParseError::IdnaError } +} #[derive(PartialEq, Eq)] pub enum Context { @@ -92,574 +87,910 @@ pub enum Context { Setter, } - -pub fn parse_url(input: &str, parser: &UrlParser) -> ParseResult { - let input = input.trim_matches(&[' ', '\t', '\n', '\r', '\x0C'][..]); - let (scheme, remaining) = match parse_scheme(input, Context::UrlParser) { - Some((scheme, remaining)) => (scheme, remaining), - // No-scheme state - None => return match parser.base_url { - Some(&Url { ref scheme, scheme_data: SchemeData::Relative(ref base), - ref query, .. }) => { - let scheme_type = parser.get_scheme_type(&scheme); - parse_relative_url(input, scheme.clone(), scheme_type, base, query, parser) - }, - Some(_) => Err(ParseError::RelativeUrlWithNonRelativeBase), - None => Err(ParseError::RelativeUrlWithoutBase), - }, - }; - let scheme_type = parser.get_scheme_type(&scheme); - match scheme_type { - SchemeType::FileLike => { - // Relative state? - match parser.base_url { - Some(&Url { scheme: ref base_scheme, scheme_data: SchemeData::Relative(ref base), - ref query, .. 
}) - if scheme == *base_scheme => { - parse_relative_url(remaining, scheme, scheme_type, base, query, parser) - }, - // FIXME: Should not have to use a made-up base URL. - _ => parse_relative_url(remaining, scheme, scheme_type, &RelativeSchemeData { - username: String::new(), password: None, host: Host::Domain(String::new()), - port: None, default_port: None, path: Vec::new() - }, &None, parser) - } - }, - SchemeType::Relative(..) => { - match parser.base_url { - Some(&Url { scheme: ref base_scheme, scheme_data: SchemeData::Relative(ref base), - ref query, .. }) - if scheme == *base_scheme && !remaining.starts_with("//") => { - try!(parser.parse_error(ParseError::RelativeUrlWithScheme)); - parse_relative_url(remaining, scheme, scheme_type, base, query, parser) - }, - _ => parse_absolute_url(scheme, scheme_type, remaining, parser), - } - }, - SchemeType::NonRelative => { - // Scheme data state - let (scheme_data, remaining) = try!(parse_scheme_data(remaining, parser)); - let (query, fragment) = try!(parse_query_and_fragment(remaining, parser)); - Ok(Url { scheme: scheme, scheme_data: SchemeData::NonRelative(scheme_data), - query: query, fragment: fragment }) - } - } +#[derive(Copy, Clone)] +pub enum SchemeType { + File, + SpecialNotFile, + NotSpecial, } +impl SchemeType { + fn is_special(&self) -> bool { + !matches!(*self, SchemeType::NotSpecial) + } -pub fn parse_scheme(input: &str, context: Context) -> Option<(String, &str)> { - if input.is_empty() || !starts_with_ascii_alpha(input) { - return None + fn is_file(&self) -> bool { + matches!(*self, SchemeType::File) } - for (i, c) in input.char_indices() { - match c { - 'a'...'z' | 'A'...'Z' | '0'...'9' | '+' | '-' | '.' 
=> (), - ':' => return Some(( - input[..i].to_ascii_lowercase(), - &input[i + 1..], - )), - _ => return None, + + fn from(s: &str) -> Self { + match s { + "http" | "https" | "ws" | "wss" | "ftp" | "gopher" => SchemeType::SpecialNotFile, + "file" => SchemeType::File, + _ => SchemeType::NotSpecial, } } - // EOF before ':' - match context { - Context::Setter => Some((input.to_ascii_lowercase(), "")), - Context::UrlParser => None - } } - -fn parse_absolute_url<'a>(scheme: String, scheme_type: SchemeType, - input: &'a str, parser: &UrlParser) -> ParseResult { - // Authority first slash state - let remaining = try!(skip_slashes(input, parser)); - // Authority state - let (username, password, remaining) = try!(parse_userinfo(remaining, parser)); - // Host state - let (host, port, default_port, remaining) = try!(parse_host(remaining, scheme_type, parser)); - let (path, remaining) = try!(parse_path_start( - remaining, Context::UrlParser, scheme_type, parser)); - let scheme_data = SchemeData::Relative(RelativeSchemeData { - username: username, password: password, - host: host, port: port, default_port: default_port, - path: path }); - let (query, fragment) = try!(parse_query_and_fragment(remaining, parser)); - Ok(Url { scheme: scheme, scheme_data: scheme_data, query: query, fragment: fragment }) +pub fn default_port(scheme: &str) -> Option { + match scheme { + "http" | "ws" => Some(80), + "https" | "wss" => Some(443), + "ftp" => Some(21), + "gopher" => Some(70), + _ => None, + } } +pub struct Parser<'a> { + pub serialization: String, + pub base_url: Option<&'a Url>, + pub query_encoding_override: EncodingOverride, + pub log_syntax_violation: Option<&'a Fn(&'static str)>, +} -fn parse_relative_url<'a>(input: &'a str, scheme: String, scheme_type: SchemeType, - base: &RelativeSchemeData, base_query: &Option, - parser: &UrlParser) - -> ParseResult { - let mut chars = input.chars(); - match chars.next() { - Some('/') | Some('\\') => { - let ch = chars.next(); - // Relative slash 
state - if matches!(ch, Some('/') | Some('\\')) { - if ch == Some('\\') { - try!(parser.parse_error(ParseError::InvalidBackslash)) - } - if scheme_type == SchemeType::FileLike { - // File host state - let remaining = &input[2..]; - let (host, remaining) = if remaining.len() >= 2 - && starts_with_ascii_alpha(remaining) - && matches!(remaining.as_bytes()[1], b':' | b'|') - && (remaining.len() == 2 - || matches!(remaining.as_bytes()[2], - b'/' | b'\\' | b'?' | b'#')) - { - // Windows drive letter quirk - (Host::Domain(String::new()), remaining) - } else { - try!(parse_file_host(remaining, parser)) - }; - let (path, remaining) = try!(parse_path_start( - remaining, Context::UrlParser, scheme_type, parser)); - let scheme_data = SchemeData::Relative(RelativeSchemeData { - username: String::new(), password: None, - host: host, port: None, default_port: None, path: path - }); - let (query, fragment) = try!(parse_query_and_fragment(remaining, parser)); - Ok(Url { scheme: scheme, scheme_data: scheme_data, - query: query, fragment: fragment }) - } else { - parse_absolute_url(scheme, scheme_type, input, parser) - } - } else { - // Relative path state - let (path, remaining) = try!(parse_path( - &[], &input[1..], Context::UrlParser, scheme_type, parser)); - let scheme_data = SchemeData::Relative(if scheme_type == SchemeType::FileLike { - RelativeSchemeData { - username: String::new(), password: None, host: - Host::Domain(String::new()), port: None, default_port: None, path: path - } - } else { - RelativeSchemeData { - username: base.username.clone(), - password: base.password.clone(), - host: base.host.clone(), - port: base.port.clone(), - default_port: base.default_port.clone(), - path: path - } - }); - let (query, fragment) = try!( - parse_query_and_fragment(remaining, parser)); - Ok(Url { scheme: scheme, scheme_data: scheme_data, - query: query, fragment: fragment }) - } - }, - Some('?') => { - let (query, fragment) = try!(parse_query_and_fragment(input, parser)); - Ok(Url { 
scheme: scheme, scheme_data: SchemeData::Relative(base.clone()), - query: query, fragment: fragment }) - }, - Some('#') => { - let fragment = Some(try!(parse_fragment(&input[1..], parser))); - Ok(Url { scheme: scheme, scheme_data: SchemeData::Relative(base.clone()), - query: base_query.clone(), fragment: fragment }) - } - None => { - Ok(Url { scheme: scheme, scheme_data: SchemeData::Relative(base.clone()), - query: base_query.clone(), fragment: None }) - } - _ => { - let (scheme_data, remaining) = if scheme_type == SchemeType::FileLike - && input.len() >= 2 - && starts_with_ascii_alpha(input) - && matches!(input.as_bytes()[1], b':' | b'|') - && (input.len() == 2 - || matches!(input.as_bytes()[2], b'/' | b'\\' | b'?' | b'#')) - { - // Windows drive letter quirk - let (path, remaining) = try!(parse_path( - &[], input, Context::UrlParser, scheme_type, parser)); - (SchemeData::Relative(RelativeSchemeData { - username: String::new(), password: None, - host: Host::Domain(String::new()), - port: None, - default_port: None, - path: path - }), remaining) - } else { - let base_path = &base.path[..max(base.path.len(), 1) - 1]; - // Relative path state - let (path, remaining) = try!(parse_path( - base_path, input, Context::UrlParser, scheme_type, parser)); - (SchemeData::Relative(RelativeSchemeData { - username: base.username.clone(), - password: base.password.clone(), - host: base.host.clone(), - port: base.port.clone(), - default_port: base.default_port.clone(), - path: path - }), remaining) - }; - let (query, fragment) = try!(parse_query_and_fragment(remaining, parser)); - Ok(Url { scheme: scheme, scheme_data: scheme_data, - query: query, fragment: fragment }) +impl<'a> Parser<'a> { + fn syntax_violation(&self, reason: &'static str) { + if let Some(log) = self.log_syntax_violation { + log(reason) } } -} - -fn skip_slashes<'a>(input: &'a str, parser: &UrlParser) -> ParseResult<&'a str> { - let first_non_slash = input.find(|c| !matches!(c, '/' | '\\')).unwrap_or(input.len()); 
- if &input[..first_non_slash] != "//" { - try!(parser.parse_error(ParseError::ExpectedTwoSlashes)); + fn syntax_violation_if bool>(&self, reason: &'static str, test: F) { + // Skip test if not logging. + if let Some(log) = self.log_syntax_violation { + if test() { + log(reason) + } + } } - Ok(&input[first_non_slash..]) -} + /// https://url.spec.whatwg.org/#concept-basic-url-parser + pub fn parse_url(mut self, original_input: &str) -> ParseResult { + let input = original_input.trim_matches(c0_control_or_space); + if input.len() < original_input.len() { + self.syntax_violation("leading or trailing control or space character") + } + if let Ok(remaining) = self.parse_scheme(input, Context::UrlParser) { + return self.parse_with_scheme(remaining) + } -fn parse_userinfo<'a>(input: &'a str, parser: &UrlParser) - -> ParseResult<(String, Option, &'a str)> { - let mut last_at = None; - for (i, c) in input.char_indices() { - match c { - '@' => { - if last_at.is_some() { - try!(parser.parse_error(ParseError::InvalidAtSymbolInUser)) + // No-scheme state + if let Some(base_url) = self.base_url { + if input.starts_with("#") { + self.fragment_only(base_url, input) + } else if base_url.non_relative { + Err(ParseError::RelativeUrlWithNonRelativeBase) + } else { + let scheme_type = SchemeType::from(base_url.scheme()); + if scheme_type.is_file() { + self.parse_file(input, Some(base_url)) + } else { + self.parse_relative(input, scheme_type, base_url) } - last_at = Some(i) - }, - '/' | '\\' | '?' 
| '#' => break, - _ => (), + } + } else { + Err(ParseError::RelativeUrlWithoutBase) } } - let (input, remaining) = match last_at { - Some(at) => (&input[..at], &input[at + 1..]), - None => return Ok((String::new(), None, input)), - }; - let mut username = String::new(); - let mut password = None; - for (i, c, next_i) in input.char_ranges() { - match c { - ':' => { - password = Some(try!(parse_password(&input[i + 1..], parser))); - break - }, - '\t' | '\n' | '\r' => try!(parser.parse_error(ParseError::InvalidCharacter)), - _ => { - try!(check_url_code_point(input, i, c, parser)); - // The spec says to use the default encode set, - // but also replaces '@' by '%40' in an earlier step. - utf8_percent_encode_to(&input[i..next_i], - USERINFO_ENCODE_SET, &mut username); + pub fn parse_scheme<'i>(&mut self, input: &'i str, context: Context) -> ParseResult<&'i str> { + if input.is_empty() || !input.starts_with(ascii_alpha) { + return Err(ParseError::InvalidScheme) + } + debug_assert!(self.serialization.is_empty()); + for (i, c) in input.char_indices() { + match c { + 'a'...'z' | 'A'...'Z' | '0'...'9' | '+' | '-' | '.' => { + self.serialization.push(c.to_ascii_lowercase()) + } + ':' => return Ok(&input[i + 1..]), + _ => { + self.serialization.clear(); + return Err(ParseError::InvalidScheme) + } + } + } + // EOF before ':' + match context { + Context::Setter => Ok(""), + Context::UrlParser => { + self.serialization.clear(); + Err(ParseError::InvalidScheme) } } } - Ok((username, password, remaining)) -} - -fn parse_password(input: &str, parser: &UrlParser) -> ParseResult { - let mut password = String::new(); - for (i, c, next_i) in input.char_ranges() { - match c { - '\t' | '\n' | '\r' => try!(parser.parse_error(ParseError::InvalidCharacter)), - _ => { - try!(check_url_code_point(input, i, c, parser)); - // The spec says to use the default encode set, - // but also replaces '@' by '%40' in an earlier step. 
- utf8_percent_encode_to(&input[i..next_i], - USERINFO_ENCODE_SET, &mut password); + fn parse_with_scheme(mut self, input: &str) -> ParseResult { + let scheme_end = try!(to_u32(self.serialization.len())); + let scheme_type = SchemeType::from(&self.serialization); + self.serialization.push(':'); + match scheme_type { + SchemeType::File => { + self.syntax_violation_if("expected // after file:", || !input.starts_with("//")); + let base_file_url = self.base_url.and_then(|base| { + if base.scheme() == "file" { Some(base) } else { None } + }); + self.serialization.clear(); + self.parse_file(input, base_file_url) + } + SchemeType::SpecialNotFile => { + // special relative or authority state + let slashes_count = input.find(|c| !matches!(c, '/' | '\\')).unwrap_or(input.len()); + if let Some(base_url) = self.base_url { + if slashes_count < 2 && + base_url.scheme() == &self.serialization[..scheme_end as usize] { + // Non-relative URLs only happen with "not special" schemes. + debug_assert!(!base_url.non_relative); + self.serialization.clear(); + return self.parse_relative(input, scheme_type, base_url) + } + } + // special authority slashes state + self.syntax_violation_if("expected //", || &input[..slashes_count] != "//"); + self.after_double_slash(&input[slashes_count..], scheme_type, scheme_end) } + SchemeType::NotSpecial => self.parse_non_special(input, scheme_type, scheme_end) } } - Ok(password) -} - - -pub fn parse_host<'a>(input: &'a str, scheme_type: SchemeType, parser: &UrlParser) - -> ParseResult<(Host, Option, Option, &'a str)> { - let (host, remaining) = try!(parse_hostname(input, parser)); - let (port, default_port, remaining) = if remaining.starts_with(":") { - try!(parse_port(&remaining[1..], scheme_type, parser)) - } else { - (None, scheme_type.default_port(), remaining) - }; - Ok((host, port, default_port, remaining)) -} + /// Scheme other than file, http, https, ws, ws, ftp, gopher. 
+ fn parse_non_special(mut self, input: &str, scheme_type: SchemeType, scheme_end: u32) + -> ParseResult { + // path or authority state ( + if input.starts_with("//") { + return self.after_double_slash(&input[2..], scheme_type, scheme_end) + } + // Anarchist URL (no authority) + let path_start = try!(to_u32(self.serialization.len())); + let username_end = path_start; + let host_start = path_start; + let host_end = path_start; + let host = HostInternal::None; + let port = None; + let relative = input.starts_with("/"); + let remaining = if relative { + let path_start = self.serialization.len(); + self.serialization.push('/'); + self.parse_path(scheme_type, &mut false, path_start, &input[1..], Context::UrlParser) + } else { + self.parse_non_relative_path(input) + }; + self.with_query_and_fragment(!relative, scheme_end, username_end, host_start, + host_end, host, port, path_start, remaining) + } -pub fn parse_hostname<'a>(input: &'a str, parser: &UrlParser) - -> ParseResult<(Host, &'a str)> { - let mut inside_square_brackets = false; - let mut host_input = String::new(); - let mut end = input.len(); - for (i, c) in input.char_indices() { + fn parse_file(mut self, input: &str, mut base_file_url: Option<&Url>) -> ParseResult { + // file state + debug_assert!(self.serialization.is_empty()); + let c = input.chars().next(); match c { - ':' if !inside_square_brackets => { - end = i; - break + None => { + if let Some(base_url) = base_file_url { + // Copy everything except the fragment + let before_fragment = match base_url.fragment_start { + Some(i) => &base_url.serialization[..i as usize], + None => &*base_url.serialization, + }; + self.serialization.push_str(before_fragment); + Ok(Url { + serialization: self.serialization, + fragment_start: None, + ..*base_url + }) + } else { + self.serialization.push_str("file:///"); + let scheme_end = "file".len() as u32; + let path_start = "file://".len() as u32; + Ok(Url { + serialization: self.serialization, + non_relative: false, + 
scheme_end: scheme_end, + username_end: path_start, + host_start: path_start, + host_end: path_start, + host: HostInternal::None, + port: None, + path_start: path_start, + query_start: None, + fragment_start: None, + }) + } }, - '/' | '\\' | '?' | '#' => { - end = i; - break + Some('?') => { + if let Some(base_url) = base_file_url { + // Copy everything up to the query string + let before_query = match (base_url.query_start, base_url.fragment_start) { + (None, None) => &*base_url.serialization, + (Some(i), _) | + (None, Some(i)) => base_url.slice(..i) + }; + self.serialization.push_str(before_query); + let (query_start, fragment_start) = + try!(self.parse_query_and_fragment(base_url.scheme_end, input)); + Ok(Url { + serialization: self.serialization, + query_start: query_start, + fragment_start: fragment_start, + ..*base_url + }) + } else { + self.serialization.push_str("file:///"); + let scheme_end = "file".len() as u32; + let path_start = "file://".len() as u32; + let (query_start, fragment_start) = + try!(self.parse_query_and_fragment(scheme_end, input)); + Ok(Url { + serialization: self.serialization, + non_relative: false, + scheme_end: scheme_end, + username_end: path_start, + host_start: path_start, + host_end: path_start, + host: HostInternal::None, + port: None, + path_start: path_start, + query_start: query_start, + fragment_start: fragment_start, + }) + } }, - '\t' | '\n' | '\r' => try!(parser.parse_error(ParseError::InvalidCharacter)), - c => { - match c { - '[' => inside_square_brackets = true, - ']' => inside_square_brackets = false, - _ => (), + Some('#') => { + if let Some(base_url) = base_file_url { + self.fragment_only(base_url, input) + } else { + self.serialization.push_str("file:///"); + let scheme_end = "file".len() as u32; + let path_start = "file://".len() as u32; + let fragment_start = "file:///".len() as u32; + self.parse_fragment(&input[1..]); + Ok(Url { + serialization: self.serialization, + non_relative: false, + scheme_end: scheme_end, 
+ username_end: path_start, + host_start: path_start, + host_end: path_start, + host: HostInternal::None, + port: None, + path_start: path_start, + query_start: None, + fragment_start: Some(fragment_start), + }) + } + } + Some('/') | Some('\\') => { + self.syntax_violation_if("backslash", || c == Some('\\')); + let input = &input[1..]; + // file slash state + let c = input.chars().next(); + self.syntax_violation_if("backslash", || c == Some('\\')); + if matches!(c, Some('/') | Some('\\')) { + // file host state + self.serialization.push_str("file://"); + let scheme_end = "file".len() as u32; + let host_start = "file://".len() as u32; + let (path_start, host, remaining) = try!(self.parse_file_host(&input[1..])); + let host_end = try!(to_u32(self.serialization.len())); + let mut has_host = !matches!(host, HostInternal::None); + let remaining = if path_start { + self.parse_path_start( + SchemeType::File, &mut has_host, remaining, Context::UrlParser) + } else { + let path_start = self.serialization.len(); + self.serialization.push('/'); + self.parse_path(SchemeType::File, &mut has_host, path_start, + remaining, Context::UrlParser) + }; + // FIXME: deal with has_host + let (query_start, fragment_start) = + try!(self.parse_query_and_fragment(scheme_end, remaining)); + Ok(Url { + serialization: self.serialization, + non_relative: false, + scheme_end: scheme_end, + username_end: host_start, + host_start: host_start, + host_end: host_end, + host: host, + port: None, + path_start: host_end, + query_start: query_start, + fragment_start: fragment_start, + }) + } else { + self.serialization.push_str("file:///"); + let scheme_end = "file".len() as u32; + let path_start = "file://".len(); + if let Some(base_url) = base_file_url { + let first_segment = base_url.path_segments().unwrap().next().unwrap(); + // FIXME: *normalized* drive letter + if is_windows_drive_letter(first_segment) { + self.serialization.push_str(first_segment); + self.serialization.push('/'); + } + } + let 
remaining = self.parse_path( + SchemeType::File, &mut false, path_start, input, Context::UrlParser); + let (query_start, fragment_start) = + try!(self.parse_query_and_fragment(scheme_end, remaining)); + let path_start = path_start as u32; + Ok(Url { + serialization: self.serialization, + non_relative: false, + scheme_end: scheme_end, + username_end: path_start, + host_start: path_start, + host_end: path_start, + host: HostInternal::None, + port: None, + path_start: path_start, + query_start: query_start, + fragment_start: fragment_start, + }) + } + } + _ => { + if starts_with_windows_drive_letter_segment(input) { + base_file_url = None; + } + if let Some(base_url) = base_file_url { + let before_query = match (base_url.query_start, base_url.fragment_start) { + (None, None) => &*base_url.serialization, + (Some(i), _) | + (None, Some(i)) => base_url.slice(..i) + }; + self.serialization.push_str(before_query); + self.pop_path(SchemeType::File, base_url.path_start as usize); + let remaining = self.parse_path( + SchemeType::File, &mut true, base_url.path_start as usize, + input, Context::UrlParser); + let non_relative = false; + self.with_query_and_fragment( + non_relative, base_url.scheme_end, base_url.username_end, base_url.host_start, + base_url.host_end, base_url.host, base_url.port, base_url.path_start, remaining) + } else { + self.serialization.push_str("file:///"); + let scheme_end = "file".len() as u32; + let path_start = "file://".len(); + let remaining = self.parse_path( + SchemeType::File, &mut false, path_start, input, Context::UrlParser); + let (query_start, fragment_start) = + try!(self.parse_query_and_fragment(scheme_end, remaining)); + let path_start = path_start as u32; + Ok(Url { + serialization: self.serialization, + non_relative: false, + scheme_end: scheme_end, + username_end: path_start, + host_start: path_start, + host_end: path_start, + host: HostInternal::None, + port: None, + path_start: path_start, + query_start: query_start, + fragment_start: 
fragment_start, + }) } - host_input.push(c) } } } - let host = try!(Host::parse(&host_input)); - Ok((host, &input[end..])) -} - -pub fn parse_port<'a>(input: &'a str, scheme_type: SchemeType, parser: &UrlParser) - -> ParseResult<(Option, Option, &'a str)> { - let mut port = 0; - let mut has_any_digit = false; - let mut end = input.len(); - for (i, c) in input.char_indices() { - match c { - '0'...'9' => { - port = port * 10 + (c as u32 - '0' as u32); - if port > ::std::u16::MAX as u32 { - return Err(ParseError::InvalidPort) - } - has_any_digit = true; + fn parse_relative(mut self, input: &str, scheme_type: SchemeType, base_url: &Url) + -> ParseResult { + // relative state + debug_assert!(self.serialization.is_empty()); + match input.chars().next() { + None => { + // Copy everything except the fragment + let before_fragment = match base_url.fragment_start { + Some(i) => &base_url.serialization[..i as usize], + None => &*base_url.serialization, + }; + self.serialization.push_str(before_fragment); + Ok(Url { + serialization: self.serialization, + fragment_start: None, + ..*base_url + }) }, - '/' | '\\' | '?' 
| '#' => { - end = i; - break + Some('?') => { + // Copy everything up to the query string + let before_query = match (base_url.query_start, base_url.fragment_start) { + (None, None) => &*base_url.serialization, + (Some(i), _) | + (None, Some(i)) => base_url.slice(..i) + }; + self.serialization.push_str(before_query); + let (query_start, fragment_start) = + try!(self.parse_query_and_fragment(base_url.scheme_end, input)); + Ok(Url { + serialization: self.serialization, + query_start: query_start, + fragment_start: fragment_start, + ..*base_url + }) }, - '\t' | '\n' | '\r' => try!(parser.parse_error(ParseError::InvalidCharacter)), - _ => return Err(ParseError::InvalidPort) + Some('#') => self.fragment_only(base_url, input), + Some('/') | Some('\\') => { + let slashes_count = input.find(|c| !matches!(c, '/' | '\\')).unwrap_or(input.len()); + if slashes_count >= 2 { + self.syntax_violation_if("expected //", || &input[..slashes_count] != "//"); + let scheme_end = base_url.scheme_end; + debug_assert!(base_url.byte_at(scheme_end) == b':'); + self.serialization.push_str(base_url.slice(..scheme_end + 1)); + return self.after_double_slash(&input[slashes_count..], scheme_type, scheme_end) + } + let path_start = base_url.path_start; + debug_assert!(base_url.byte_at(path_start) == b'/'); + self.serialization.push_str(base_url.slice(..path_start + 1)); + let remaining = self.parse_path( + scheme_type, &mut true, path_start as usize, &input[1..], Context::UrlParser); + let non_relative = false; + self.with_query_and_fragment( + non_relative, base_url.scheme_end, base_url.username_end, base_url.host_start, + base_url.host_end, base_url.host, base_url.port, base_url.path_start, remaining) + } + _ => { + let before_query = match (base_url.query_start, base_url.fragment_start) { + (None, None) => &*base_url.serialization, + (Some(i), _) | + (None, Some(i)) => base_url.slice(..i) + }; + self.serialization.push_str(before_query); + // FIXME spec says just "remove last entry", not the 
"pop" algorithm + self.pop_path(scheme_type, base_url.path_start as usize); + let remaining = self.parse_path( + scheme_type, &mut true, base_url.path_start as usize, input, Context::UrlParser); + let non_relative = false; + self.with_query_and_fragment( + non_relative, base_url.scheme_end, base_url.username_end, base_url.host_start, + base_url.host_end, base_url.host, base_url.port, base_url.path_start, remaining) + } } } - let default_port = scheme_type.default_port(); - let mut port = Some(port as u16); - if !has_any_digit || port == default_port { - port = None; - } - Ok((port, default_port, &input[end..])) -} + fn after_double_slash(mut self, input: &str, scheme_type: SchemeType, scheme_end: u32) + -> ParseResult { + self.serialization.push('/'); + self.serialization.push('/'); + let non_relative = false; + // authority state + let (username_end, remaining) = try!(self.parse_userinfo(input, scheme_type)); + // host state + let host_start = try!(to_u32(self.serialization.len())); + let (host_end, host, port, remaining) = + try!(self.parse_host_and_port(remaining, scheme_end, scheme_type)); + // path state + let path_start = try!(to_u32(self.serialization.len())); + let remaining = self.parse_path_start( + scheme_type, &mut true, remaining, Context::UrlParser); + self.with_query_and_fragment(non_relative, scheme_end, username_end, host_start, + host_end, host, port, path_start, remaining) + } -fn parse_file_host<'a>(input: &'a str, parser: &UrlParser) -> ParseResult<(Host, &'a str)> { - let mut host_input = String::new(); - let mut end = input.len(); - for (i, c) in input.char_indices() { - match c { - '/' | '\\' | '?' 
| '#' => { - end = i; - break - }, - '\t' | '\n' | '\r' => try!(parser.parse_error(ParseError::InvalidCharacter)), - _ => host_input.push(c) + /// Return (username_end, remaining) + fn parse_userinfo<'i>(&mut self, input: &'i str, scheme_type: SchemeType) + -> ParseResult<(u32, &'i str)> { + let mut last_at = None; + for (i, c) in input.char_indices() { + match c { + '@' => { + if last_at.is_some() { + self.syntax_violation("unencoded @ sign in username or password") + } else { + self.syntax_violation( + "embedding authentification information (username or password) \ + in an URL is not recommended") + } + last_at = Some(i) + }, + '/' | '?' | '#' => break, + '\\' if scheme_type.is_special() => break, + _ => (), + } } + let (input, remaining) = match last_at { + None => return Ok((try!(to_u32(self.serialization.len())), input)), + Some(0) => return Ok((try!(to_u32(self.serialization.len())), &input[1..])), + Some(at) => (&input[..at], &input[at + 1..]), + }; + + let mut username_end = None; + for (i, c, next_i) in input.char_ranges() { + match c { + ':' if username_end.is_none() => { + // Start parsing password + username_end = Some(try!(to_u32(self.serialization.len()))); + self.serialization.push(':'); + }, + '\t' | '\n' | '\r' => {}, + _ => { + self.check_url_code_point(input, i, c); + let utf8_c = &input[i..next_i]; + utf8_percent_encode_to(utf8_c, USERINFO_ENCODE_SET, &mut self.serialization); + } + } + } + let username_end = match username_end { + Some(i) => i, + None => try!(to_u32(self.serialization.len())), + }; + self.serialization.push('@'); + Ok((username_end, remaining)) } - let host = if host_input.is_empty() { - Host::Domain(String::new()) - } else { - try!(Host::parse(&host_input)) - }; - Ok((host, &input[end..])) -} - -pub fn parse_standalone_path(input: &str, parser: &UrlParser) - -> ParseResult<(Vec, Option, Option)> { - if !input.starts_with("/") { - if input.starts_with("\\") { - try!(parser.parse_error(ParseError::InvalidBackslash)); + pub fn 
parse_host_and_port<'i>(&mut self, input: &'i str, + scheme_end: u32, scheme_type: SchemeType) + -> ParseResult<(u32, HostInternal, Option, &'i str)> { + let (host, remaining) = try!(self.parse_host(input, scheme_type)); + let host_end = try!(to_u32(self.serialization.len())); + let (port, remaining) = if remaining.starts_with(":") { + try!(self.parse_port(&remaining[1..], scheme_end)) } else { - return Err(ParseError::ExpectedInitialSlash) - } + (None, remaining) + }; + Ok((host_end, host, port, remaining)) } - let (path, remaining) = try!(parse_path( - &[], &input[1..], Context::UrlParser, SchemeType::Relative(0), parser)); - let (query, fragment) = try!(parse_query_and_fragment(remaining, parser)); - Ok((path, query, fragment)) -} - -pub fn parse_path_start<'a>(input: &'a str, context: Context, scheme_type: SchemeType, - parser: &UrlParser) - -> ParseResult<(Vec, &'a str)> { - let mut i = 0; - // Relative path start state - match input.chars().next() { - Some('/') => i = 1, - Some('\\') => { - try!(parser.parse_error(ParseError::InvalidBackslash)); - i = 1; - }, - _ => () - } - parse_path(&[], &input[i..], context, scheme_type, parser) -} - - -fn parse_path<'a>(base_path: &[String], input: &'a str, context: Context, - scheme_type: SchemeType, parser: &UrlParser) - -> ParseResult<(Vec, &'a str)> { - // Relative path state - let mut path = base_path.to_vec(); - let mut iter = input.char_ranges(); - let mut end; - loop { - let mut path_part = String::new(); - let mut ends_with_slash = false; - end = input.len(); - while let Some((i, c, next_i)) = iter.next() { - match c { - '/' => { - ends_with_slash = true; + pub fn parse_host<'i>(&mut self, input: &'i str, scheme_type: SchemeType) + -> ParseResult<(HostInternal, &'i str)> { + let mut inside_square_brackets = false; + let mut has_ignored_chars = false; + let mut end = input.len(); + for (i, b) in input.bytes().enumerate() { + match b { + b':' if !inside_square_brackets => { end = i; break }, - '\\' => { - 
try!(parser.parse_error(ParseError::InvalidBackslash)); - ends_with_slash = true; + b'/' | b'?' | b'#' => { end = i; break - }, - '?' | '#' if context == Context::UrlParser => { + } + b'\\' if scheme_type.is_special() => { end = i; break - }, - '\t' | '\n' | '\r' => try!(parser.parse_error(ParseError::InvalidCharacter)), - _ => { - try!(check_url_code_point(input, i, c, parser)); - utf8_percent_encode_to(&input[i..next_i], - DEFAULT_ENCODE_SET, &mut path_part); } + b'\t' | b'\n' | b'\r' => { + self.syntax_violation("invalid character"); + has_ignored_chars = true; + } + b'[' => inside_square_brackets = true, + b']' => inside_square_brackets = false, + _ => {} } } - match &*path_part { - ".." | ".%2e" | ".%2E" | "%2e." | "%2E." | - "%2e%2e" | "%2E%2e" | "%2e%2E" | "%2E%2E" => { - path.pop(); - if !ends_with_slash { - path.push(String::new()); + let replaced: String; + let host_input = if has_ignored_chars { + replaced = input[..end].chars().filter(|&c| !matches!(c, '\t' | '\n' | '\r')).collect(); + &*replaced + } else { + &input[..end] + }; + if scheme_type.is_special() && host_input.is_empty() { + return Err(ParseError::EmptyHost) + } + let host = try!(host::parse(&host_input, &mut self.serialization)); + Ok((host, &input[end..])) + } + + pub fn parse_file_host<'i>(&mut self, input: &'i str) + -> ParseResult<(bool, HostInternal, &'i str)> { + let mut has_ignored_chars = false; + let mut end = input.len(); + for (i, b) in input.bytes().enumerate() { + match b { + b'/' | b'\\' | b'?' | b'#' => { + end = i; + break } - }, - "." 
| "%2e" | "%2E" => { - if !ends_with_slash { - path.push(String::new()); + b'\t' | b'\n' | b'\r' => { + self.syntax_violation("invalid character"); + has_ignored_chars = true; } - }, - _ => { - if scheme_type == SchemeType::FileLike - && path.is_empty() - && path_part.len() == 2 - && starts_with_ascii_alpha(&path_part) - && path_part.as_bytes()[1] == b'|' { - // Windows drive letter quirk - unsafe { - path_part.as_mut_vec()[1] = b':' - } + _ => {} + } + } + let replaced: String; + let host_input = if has_ignored_chars { + replaced = input[..end].chars().filter(|&c| !matches!(c, '\t' | '\n' | '\r')).collect(); + &*replaced + } else { + &input[..end] + }; + if is_windows_drive_letter(host_input) { + return Ok((false, HostInternal::None, input)) + } + let mut host; + if host_input.is_empty() { + host = HostInternal::None; + } else { + let host_start = self.serialization.len(); + host = try!(host::parse(&host_input, &mut self.serialization)); + if &self.serialization[host_start..] == "localhost" { + host = HostInternal::None; + self.serialization.truncate(host_start); + } + } + Ok((true, host, &input[end..])) + } + + pub fn parse_port<'i>(&mut self, input: &'i str, scheme_end: u32) + -> ParseResult<(Option, &'i str)> { + let mut port = 0; + let mut has_any_digit = false; + let mut end = input.len(); + for (i, c) in input.char_indices() { + if let Some(digit) = c.to_digit(10) { + port = port * 10 + digit; + if port > ::std::u16::MAX as u32 { + return Err(ParseError::InvalidPort) + } + has_any_digit = true; + } else { + match c { + '/' | '\\' | '?' 
| '#' => { + end = i; + break + }, + '\t' | '\n' | '\r' => self.syntax_violation("invalid character"), + _ => return Err(ParseError::InvalidPort) } - path.push(path_part) } } - if !ends_with_slash { - break + let mut opt_port = Some(port as u16); + if !has_any_digit || opt_port == default_port(&self.serialization[..scheme_end as usize]) { + opt_port = None; + } else { + self.serialization.push(':'); + write!(&mut self.serialization, "{}", port).unwrap(); } + return Ok((opt_port, &input[end..])) } - Ok((path, &input[end..])) -} + fn parse_path_start<'i>(&mut self, scheme_type: SchemeType, has_host: &mut bool, + mut input: &'i str, context: Context) + -> &'i str { + // Path start state + let mut iter = input.chars(); + match iter.next() { + Some('/') => input = iter.as_str(), + Some('\\') => { + self.syntax_violation("backslash"); + input = iter.as_str() + } + _ => {} + } + let path_start = self.serialization.len(); + self.serialization.push('/'); + self.parse_path(scheme_type, has_host, path_start, input, context) + } -fn parse_scheme_data<'a>(input: &'a str, parser: &UrlParser) - -> ParseResult<(String, &'a str)> { - let mut scheme_data = String::new(); - let mut end = input.len(); - for (i, c, next_i) in input.char_ranges() { - match c { - '?' | '#' => { - end = i; + fn parse_path<'i>(&mut self, scheme_type: SchemeType, has_host: &mut bool, + path_start: usize, input: &'i str, context: Context) + -> &'i str { + // Relative path state + debug_assert!(self.serialization.ends_with("/")); + let mut iter = input.char_ranges(); + let mut end; + loop { + let segment_start = self.serialization.len(); + let mut ends_with_slash = false; + end = input.len(); + while let Some((i, c, next_i)) = iter.next() { + match c { + '/' => { + ends_with_slash = true; + end = i; + break + }, + '\\' if scheme_type.is_special() => { + self.syntax_violation("backslash"); + ends_with_slash = true; + end = i; + break + }, + '?' 
| '#' if context == Context::UrlParser => { + end = i; + break + }, + '\t' | '\n' | '\r' => self.syntax_violation("invalid characters"), + _ => { + self.check_url_code_point(input, i, c); + utf8_percent_encode_to( + &input[i..next_i], DEFAULT_ENCODE_SET, &mut self.serialization); + } + } + } + match &self.serialization[segment_start..] { + ".." | ".%2e" | ".%2E" | "%2e." | "%2E." | + "%2e%2e" | "%2E%2e" | "%2e%2E" | "%2E%2E" => { + debug_assert!(self.serialization.as_bytes()[segment_start - 1] == b'/'); + self.serialization.truncate(segment_start - 1); // Truncate "/.." + self.pop_path(scheme_type, path_start); + if !self.serialization[path_start..].ends_with("/") { + self.serialization.push('/') + } + }, + "." | "%2e" | "%2E" => { + self.serialization.truncate(segment_start); + }, + _ => { + if scheme_type.is_file() && is_windows_drive_letter( + &self.serialization[path_start + 1..] + ) { + unsafe { + *self.serialization.as_mut_vec().last_mut().unwrap() = b':' + } + if *has_host { + self.syntax_violation("file: with host and Windows drive letter"); + *has_host = false; // FIXME account for this in callers + } + } + if ends_with_slash { + self.serialization.push('/') + } + } + } + if !ends_with_slash { break - }, - '\t' | '\n' | '\r' => try!(parser.parse_error(ParseError::InvalidCharacter)), - _ => { - try!(check_url_code_point(input, i, c, parser)); - utf8_percent_encode_to(&input[i..next_i], - SIMPLE_ENCODE_SET, &mut scheme_data); } } + &input[end..] } - Ok((scheme_data, &input[end..])) -} + /// https://url.spec.whatwg.org/#pop-a-urls-path + fn pop_path(&mut self, scheme_type: SchemeType, path_start: usize) { + if self.serialization.len() > path_start { + let slash_position = self.serialization[path_start..].rfind('/').unwrap(); + // + 1 since rfind returns the position before the slash. 
+ let segment_start = path_start + slash_position + 1; + // Don’t pop a Windows drive letter + // FIXME: *normalized* Windows drive letter + if !( + scheme_type.is_file() && + is_windows_drive_letter(&self.serialization[segment_start..]) + ) { + self.serialization.truncate(segment_start); + } + } -fn parse_query_and_fragment(input: &str, parser: &UrlParser) - -> ParseResult<(Option, Option)> { - match input.chars().next() { - Some('#') => Ok((None, Some(try!(parse_fragment(&input[1..], parser))))), - Some('?') => { - let (query, remaining) = try!(parse_query( - &input[1..], Context::UrlParser, parser)); - let fragment = match remaining { - Some(remaining) => Some(try!(parse_fragment(remaining, parser))), - None => None - }; - Ok((Some(query), fragment)) - }, - None => Ok((None, None)), - _ => panic!("Programming error. parse_query_and_fragment() should not \ - have been called with input \"{}\"", input) } -} - -pub fn parse_query<'a>(input: &'a str, context: Context, parser: &UrlParser) - -> ParseResult<(String, Option<&'a str>)> { - let mut query = String::new(); - let mut remaining = None; - for (i, c) in input.char_indices() { - match c { - '#' if context == Context::UrlParser => { - remaining = Some(&input[i + 1..]); - break - }, - '\t' | '\n' | '\r' => try!(parser.parse_error(ParseError::InvalidCharacter)), - _ => { - try!(check_url_code_point(input, i, c, parser)); - query.push(c); + fn parse_non_relative_path<'i>(&mut self, input: &'i str) -> &'i str { + for (i, c, next_i) in input.char_ranges() { + match c { + '?' 
| '#' => return &input[i..], + '\t' | '\n' | '\r' => self.syntax_violation("invalid character"), + _ => { + self.check_url_code_point(input, i, c); + utf8_percent_encode_to( + &input[i..next_i], SIMPLE_ENCODE_SET, &mut self.serialization); + } } } + "" } - let query_bytes = parser.query_encoding_override.encode(&query); - Ok((percent_encode(&query_bytes, QUERY_ENCODE_SET), remaining)) -} + fn with_query_and_fragment(mut self, non_relative: bool, scheme_end: u32, username_end: u32, + host_start: u32, host_end: u32, host: HostInternal, + port: Option, path_start: u32, remaining: &str) + -> ParseResult { + let (query_start, fragment_start) = + try!(self.parse_query_and_fragment(scheme_end, remaining)); + Ok(Url { + serialization: self.serialization, + non_relative: non_relative, + scheme_end: scheme_end, + username_end: username_end, + host_start: host_start, + host_end: host_end, + host: host, + port: port, + path_start: path_start, + query_start: query_start, + fragment_start: fragment_start + }) + } + /// Return (query_start, fragment_start) + fn parse_query_and_fragment(&mut self, scheme_end: u32, mut input: &str) + -> ParseResult<(Option, Option)> { + let mut query_start = None; + match input.chars().next() { + Some('#') => {} + Some('?') => { + query_start = Some(try!(to_u32(self.serialization.len()))); + self.serialization.push('?'); + let remaining = self.parse_query(scheme_end, &input[1..], Context::UrlParser); + if let Some(remaining) = remaining { + input = remaining + } else { + return Ok((query_start, None)) + } + } + None => return Ok((None, None)), + _ => panic!("Programming error. 
parse_query_and_fragment() should not \ + have been called with input \"{}\"", input) + }; + + let fragment_start = try!(to_u32(self.serialization.len())); + self.serialization.push('#'); + debug_assert!(input.starts_with("#")); + self.parse_fragment(&input[1..]); + Ok((query_start, Some(fragment_start))) + } -pub fn parse_fragment<'a>(input: &'a str, parser: &UrlParser) -> ParseResult { - let mut fragment = String::new(); - for (i, c, next_i) in input.char_ranges() { - match c { - '\t' | '\n' | '\r' => try!(parser.parse_error(ParseError::InvalidCharacter)), - _ => { - try!(check_url_code_point(input, i, c, parser)); - utf8_percent_encode_to(&input[i..next_i], - SIMPLE_ENCODE_SET, &mut fragment); + pub fn parse_query<'i>(&mut self, scheme_end: u32, input: &'i str, context: Context) + -> Option<&'i str> { + let mut query = String::new(); // FIXME: use a streaming decoder instead + let mut remaining = None; + for (i, c) in input.char_indices() { + match c { + '#' if context == Context::UrlParser => { + remaining = Some(&input[i..]); + break + }, + '\t' | '\n' | '\r' => self.syntax_violation("invalid characters"), + _ => { + self.check_url_code_point(input, i, c); + query.push(c); + } } } + + let encoding = match &self.serialization[..scheme_end as usize] { + "http" | "https" | "file" | "ftp" | "gopher" => self.query_encoding_override, + _ => EncodingOverride::utf8(), + }; + let query_bytes = encoding.encode(&query); + percent_encode_to(&query_bytes, QUERY_ENCODE_SET, &mut self.serialization); + remaining } - Ok(fragment) -} + fn fragment_only(mut self, base_url: &Url, input: &str) -> ParseResult { + let before_fragment = match base_url.fragment_start { + Some(i) => base_url.slice(..i), + None => &*base_url.serialization, + }; + debug_assert!(self.serialization.is_empty()); + self.serialization.reserve(before_fragment.len() + input.len()); + self.serialization.push_str(before_fragment); + self.serialization.push('#'); + debug_assert!(input.starts_with("#")); + 
self.parse_fragment(&input[1..]); + Ok(Url { + serialization: self.serialization, + fragment_start: Some(try!(to_u32(before_fragment.len()))), + ..*base_url + }) + } -#[inline] -pub fn starts_with_ascii_alpha(string: &str) -> bool { - matches!(string.as_bytes()[0], b'a'...b'z' | b'A'...b'Z') + pub fn parse_fragment(&mut self, input: &str) { + for (i, c, next_i) in input.char_ranges() { + match c { + '\0' | '\t' | '\n' | '\r' => self.syntax_violation("invalid character"), + _ => { + self.check_url_code_point(input, i, c); + utf8_percent_encode_to( + &input[i..next_i], SIMPLE_ENCODE_SET, &mut self.serialization); + } + } + } + } + + fn check_url_code_point(&self, input: &str, i: usize, c: char) { + if let Some(log) = self.log_syntax_violation { + if c == '%' { + if !starts_with_2_hex(&input[i + 1..]) { + log("expected 2 hex digits after %") + } + } else if !is_url_code_point(c) { + log("non-URL code point") + } + } + } } #[inline] @@ -674,6 +1005,13 @@ fn starts_with_2_hex(input: &str) -> bool { && is_ascii_hex_digit(input.as_bytes()[1]) } +// Non URL code points: +// U+0000 to U+0020 (space) +// " # % < > [ \ ] ^ ` { | } +// U+007F to U+009F +// surrogates +// U+FDD0 to U+FDEF +// Last two of each plane: U+__FFFE to U+__FFFF for __ in 00 to 10 hex #[inline] fn is_url_code_point(c: char) -> bool { matches!(c, @@ -693,20 +1031,11 @@ fn is_url_code_point(c: char) -> bool { '\u{F0000}'...'\u{FFFFD}' | '\u{100000}'...'\u{10FFFD}') } -// Non URL code points: -// U+0000 to U+0020 (space) -// " # % < > [ \ ] ^ ` { | } -// U+007F to U+009F -// surrogates -// U+FDD0 to U+FDEF -// Last two of each plane: U+__FFFE to U+__FFFF for __ in 00 to 10 hex - pub trait StrCharRanges<'a> { fn char_ranges(&self) -> CharRanges<'a>; } - impl<'a> StrCharRanges<'a> for &'a str { #[inline] fn char_ranges(&self) -> CharRanges<'a> { @@ -735,15 +1064,41 @@ impl<'a> Iterator for CharRanges<'a> { } } +/// https://url.spec.whatwg.org/#c0-controls-and-space #[inline] -fn check_url_code_point(input: 
&str, i: usize, c: char, parser: &UrlParser) - -> ParseResult<()> { - if c == '%' { - if !starts_with_2_hex(&input[i + 1..]) { - try!(parser.parse_error(ParseError::InvalidPercentEncoded)); - } - } else if !is_url_code_point(c) { - try!(parser.parse_error(ParseError::NonUrlCodePoint)); +fn c0_control_or_space(ch: char) -> bool { + ch <= ' ' // U+0000 to U+0020 +} + +/// https://url.spec.whatwg.org/#ascii-alpha +#[inline] +pub fn ascii_alpha(ch: char) -> bool { + matches!(ch, 'a'...'z' | 'A'...'Z') +} + +#[inline] +fn to_u32(i: usize) -> ParseResult { + if i <= ::std::u32::MAX as usize { + Ok(i as u32) + } else { + Err(ParseError::Overflow) } - Ok(()) +} + +/// Wether the scheme is file:, the path has a single segment, and that segment +/// is a Windows drive letter +fn is_windows_drive_letter(segment: &str) -> bool { + segment.len() == 2 + && starts_with_windows_drive_letter(segment) +} + +fn starts_with_windows_drive_letter(s: &str) -> bool { + ascii_alpha(s.as_bytes()[0] as char) + && matches!(s.as_bytes()[1], b':' | b'|') +} + +fn starts_with_windows_drive_letter_segment(s: &str) -> bool { + s.len() >= 3 + && starts_with_windows_drive_letter(s) + && matches!(s.as_bytes()[2], b'/' | b'\\' | b'?' | b'#') } diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs index 9ebcfe3e..5805b10d 100644 --- a/src/percent_encoding.rs +++ b/src/percent_encoding.rs @@ -95,18 +95,15 @@ define_encode_set! { } define_encode_set! { - /// This encode set is used in the URL parser for usernames and passwords. - pub USERINFO_ENCODE_SET = [DEFAULT_ENCODE_SET] | {'@'} + /// This encode set is used for username and password. + pub PATH_SEGMENT_ENCODE_SET = [DEFAULT_ENCODE_SET] | {'%'} } define_encode_set! { - /// This encode set should be used when setting the password field of a parsed URL. - pub PASSWORD_ENCODE_SET = [USERINFO_ENCODE_SET] | {'\\', '/'} -} - -define_encode_set! { - /// This encode set should be used when setting the username field of a parsed URL. 
- pub USERNAME_ENCODE_SET = [PASSWORD_ENCODE_SET] | {':'} + /// This encode set is used for username and password. + pub USERINFO_ENCODE_SET = [DEFAULT_ENCODE_SET] | { + '/', ':', ';', '=', '@', '[', '\\', ']', '^', '|' + } } define_encode_set! { diff --git a/src/urlutils.rs b/src/urlutils.rs deleted file mode 100644 index cd57b501..00000000 --- a/src/urlutils.rs +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright 2013-2014 Simon Sapin. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - - -//! These methods are not meant for use in Rust code, -//! only to help implement the JavaScript URLUtils API: http://url.spec.whatwg.org/#urlutils - -use super::{Url, UrlParser, SchemeType, SchemeData, RelativeSchemeData}; -use parser::{ParseError, ParseResult, Context}; -use percent_encoding::{utf8_percent_encode_to, USERNAME_ENCODE_SET, PASSWORD_ENCODE_SET}; - - -#[allow(dead_code)] -pub struct UrlUtilsWrapper<'a> { - pub url: &'a mut Url, - pub parser: &'a UrlParser<'a>, -} - -#[doc(hidden)] -pub trait UrlUtils { - fn set_scheme(&mut self, input: &str) -> ParseResult<()>; - fn set_username(&mut self, input: &str) -> ParseResult<()>; - fn set_password(&mut self, input: &str) -> ParseResult<()>; - fn set_host_and_port(&mut self, input: &str) -> ParseResult<()>; - fn set_host(&mut self, input: &str) -> ParseResult<()>; - fn set_port(&mut self, input: &str) -> ParseResult<()>; - fn set_path(&mut self, input: &str) -> ParseResult<()>; - fn set_query(&mut self, input: &str) -> ParseResult<()>; - fn set_fragment(&mut self, input: &str) -> ParseResult<()>; -} - -impl<'a> UrlUtils for UrlUtilsWrapper<'a> { - /// `URLUtils.protocol` setter - fn set_scheme(&mut self, input: &str) -> ParseResult<()> { - match ::parser::parse_scheme(input, Context::Setter) { - Some((scheme, _)) => { - if 
self.parser.get_scheme_type(&self.url.scheme).same_as(self.parser.get_scheme_type(&scheme)) { - return Err(ParseError::InvalidScheme); - } - self.url.scheme = scheme; - Ok(()) - }, - None => Err(ParseError::InvalidScheme), - } - } - - /// `URLUtils.username` setter - fn set_username(&mut self, input: &str) -> ParseResult<()> { - match self.url.scheme_data { - SchemeData::Relative(RelativeSchemeData { ref mut username, .. }) => { - username.truncate(0); - utf8_percent_encode_to(input, USERNAME_ENCODE_SET, username); - Ok(()) - }, - SchemeData::NonRelative(_) => Err(ParseError::CannotSetUsernameWithNonRelativeScheme) - } - } - - /// `URLUtils.password` setter - fn set_password(&mut self, input: &str) -> ParseResult<()> { - match self.url.scheme_data { - SchemeData::Relative(RelativeSchemeData { ref mut password, .. }) => { - if input.len() == 0 { - *password = None; - return Ok(()); - } - let mut new_password = String::new(); - utf8_percent_encode_to(input, PASSWORD_ENCODE_SET, &mut new_password); - *password = Some(new_password); - Ok(()) - }, - SchemeData::NonRelative(_) => Err(ParseError::CannotSetPasswordWithNonRelativeScheme) - } - } - - /// `URLUtils.host` setter - fn set_host_and_port(&mut self, input: &str) -> ParseResult<()> { - match self.url.scheme_data { - SchemeData::Relative(RelativeSchemeData { - ref mut host, ref mut port, ref mut default_port, .. - }) => { - let scheme_type = self.parser.get_scheme_type(&self.url.scheme); - let (new_host, new_port, new_default_port, _) = try!(::parser::parse_host( - input, scheme_type, self.parser)); - *host = new_host; - *port = new_port; - *default_port = new_default_port; - Ok(()) - }, - SchemeData::NonRelative(_) => Err(ParseError::CannotSetHostPortWithNonRelativeScheme) - } - } - - /// `URLUtils.hostname` setter - fn set_host(&mut self, input: &str) -> ParseResult<()> { - match self.url.scheme_data { - SchemeData::Relative(RelativeSchemeData { ref mut host, .. 
}) => { - let (new_host, _) = try!(::parser::parse_hostname(input, self.parser)); - *host = new_host; - Ok(()) - }, - SchemeData::NonRelative(_) => Err(ParseError::CannotSetHostWithNonRelativeScheme) - } - } - - /// `URLUtils.port` setter - fn set_port(&mut self, input: &str) -> ParseResult<()> { - match self.url.scheme_data { - SchemeData::Relative(RelativeSchemeData { ref mut port, ref mut default_port, .. }) => { - let scheme_type = self.parser.get_scheme_type(&self.url.scheme); - if scheme_type == SchemeType::FileLike { - return Err(ParseError::CannotSetPortWithFileLikeScheme); - } - let (new_port, new_default_port, _) = try!(::parser::parse_port( - input, scheme_type, self.parser)); - *port = new_port; - *default_port = new_default_port; - Ok(()) - }, - SchemeData::NonRelative(_) => Err(ParseError::CannotSetPortWithNonRelativeScheme) - } - } - - /// `URLUtils.pathname` setter - fn set_path(&mut self, input: &str) -> ParseResult<()> { - match self.url.scheme_data { - SchemeData::Relative(RelativeSchemeData { ref mut path, .. }) => { - let scheme_type = self.parser.get_scheme_type(&self.url.scheme); - let (new_path, _) = try!(::parser::parse_path_start( - input, Context::Setter, scheme_type, self.parser)); - *path = new_path; - Ok(()) - }, - SchemeData::NonRelative(_) => Err(ParseError::CannotSetPathWithNonRelativeScheme) - } - } - - /// `URLUtils.search` setter - fn set_query(&mut self, input: &str) -> ParseResult<()> { - self.url.query = if input.is_empty() { - None - } else { - let input = if input.starts_with("?") { &input[1..] 
} else { input }; - let (new_query, _) = try!(::parser::parse_query( - input, Context::Setter, self.parser)); - Some(new_query) - }; - Ok(()) - } - - /// `URLUtils.hash` setter - fn set_fragment(&mut self, input: &str) -> ParseResult<()> { - if self.url.scheme == "javascript" { - return Err(ParseError::CannotSetJavascriptFragment) - } - self.url.fragment = if input.is_empty() { - None - } else { - let input = if input.starts_with("#") { &input[1..] } else { input }; - Some(try!(::parser::parse_fragment(input, self.parser))) - }; - Ok(()) - } -} diff --git a/tests/format.rs b/tests/format.rs deleted file mode 100644 index 39aac62a..00000000 --- a/tests/format.rs +++ /dev/null @@ -1,67 +0,0 @@ -extern crate url; - -use url::{Url, Host}; -use url::format::{PathFormatter, UserInfoFormatter}; - -#[test] -fn path_formatting() { - let data = [ - (vec![], "/"), - (vec![""], "/"), - (vec!["test", "path"], "/test/path"), - (vec!["test", "path", ""], "/test/path/") - ]; - for &(ref path, result) in &data { - assert_eq!(PathFormatter { - path: path - }.to_string(), result.to_string()); - } -} - -#[test] -fn host() { - // libstd’s `Display for Ipv6Addr` serializes 0:0:0:0:0:0:_:_ and 0:0:0:0:0:ffff:_:_ - // using IPv4-like syntax, as suggested in https://tools.ietf.org/html/rfc5952#section-4 - // but https://url.spec.whatwg.org/#concept-ipv6-serializer specifies not to. - - // Not [::0.0.0.2] / [::ffff:0.0.0.2] - assert_eq!(Host::parse("[0::2]").unwrap().to_string(), "[::2]"); - assert_eq!(Host::parse("[0::ffff:0:2]").unwrap().to_string(), "[::ffff:0:2]"); -} - -#[test] -fn userinfo_formatting() { - // Test data as (username, password, result) tuples. 
- let data = [ - ("", None, ""), - ("", Some(""), ":@"), - ("", Some("password"), ":password@"), - ("username", None, "username@"), - ("username", Some(""), "username:@"), - ("username", Some("password"), "username:password@") - ]; - for &(username, password, result) in &data { - assert_eq!(UserInfoFormatter { - username: username, - password: password - }.to_string(), result.to_string()); - } -} - -#[test] -fn relative_scheme_url_formatting() { - let data = [ - ("http://example.com/", "http://example.com/"), - ("http://addslash.com", "http://addslash.com/"), - ("http://@emptyuser.com/", "http://emptyuser.com/"), - ("http://:@emptypass.com/", "http://:@emptypass.com/"), - ("http://user@user.com/", "http://user@user.com/"), - ("http://user:pass@userpass.com/", "http://user:pass@userpass.com/"), - ("http://slashquery.com/path/?q=something", "http://slashquery.com/path/?q=something"), - ("http://noslashquery.com/path?q=something", "http://noslashquery.com/path?q=something") - ]; - for &(input, result) in &data { - let url = Url::parse(input).unwrap(); - assert_eq!(url.to_string(), result.to_string()); - } -} diff --git a/tests/tests.rs b/tests/tests.rs index 11d35cde..c363538f 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -9,15 +9,28 @@ extern crate url; use std::net::{Ipv4Addr, Ipv6Addr}; +use std::path::{Path, PathBuf}; use url::{Host, Url}; +macro_rules! 
assert_from_file_path { + ($path: expr) => { assert_from_file_path!($path, $path) }; + ($path: expr, $url_path: expr) => {{ + let url = Url::from_file_path(Path::new($path)).unwrap(); + assert_eq!(url.host(), None); + assert_eq!(url.path(), $url_path); + assert_eq!(url.to_file_path(), Ok(PathBuf::from($path))); + }}; +} + + + #[test] fn new_file_paths() { - use std::path::{Path, PathBuf}; if cfg!(unix) { assert_eq!(Url::from_file_path(Path::new("relative")), Err(())); assert_eq!(Url::from_file_path(Path::new("../relative")), Err(())); - } else { + } + if cfg!(windows) { assert_eq!(Url::from_file_path(Path::new("relative")), Err(())); assert_eq!(Url::from_file_path(Path::new(r"..\relative")), Err(())); assert_eq!(Url::from_file_path(Path::new(r"\drive-relative")), Err(())); @@ -25,16 +38,9 @@ fn new_file_paths() { } if cfg!(unix) { - let mut url = Url::from_file_path(Path::new("/foo/bar")).unwrap(); - assert_eq!(url.host(), Some(&Host::Domain("".to_string()))); - assert_eq!(url.path(), Some(&["foo".to_string(), "bar".to_string()][..])); - assert!(url.to_file_path() == Ok(PathBuf::from("/foo/bar"))); - - url.path_mut().unwrap()[1] = "ba\0r".to_string(); - url.to_file_path().is_ok(); - - url.path_mut().unwrap()[1] = "ba%00r".to_string(); - url.to_file_path().is_ok(); + assert_from_file_path!("/foo/bar"); + assert_from_file_path!("/foo/ba\0r", "/foo/ba%00r"); + assert_from_file_path!("/foo/ba%00r", "/foo/ba%2500r"); } } @@ -43,9 +49,8 @@ fn new_file_paths() { fn new_path_bad_utf8() { use std::ffi::OsStr; use std::os::unix::prelude::*; - use std::path::{Path, PathBuf}; - let url = Url::from_file_path(Path::new("/foo/ba%80r")).unwrap(); + let url = Url::from_file_path(Path::new(OsStr::from_bytes(b"/foo/ba\x80r"))).unwrap(); let os_str = OsStr::from_bytes(b"/foo/ba\x80r"); assert_eq!(url.to_file_path(), Ok(PathBuf::from(os_str))); } @@ -53,22 +58,11 @@ fn new_path_bad_utf8() { #[test] fn new_path_windows_fun() { if cfg!(windows) { - use std::path::{Path, PathBuf}; - let 
mut url = Url::from_file_path(Path::new(r"C:\foo\bar")).unwrap(); - assert_eq!(url.host(), Some(&Host::Domain("".to_string()))); - assert_eq!(url.path(), Some(&["C:".to_string(), "foo".to_string(), "bar".to_string()][..])); - assert_eq!(url.to_file_path(), - Ok(PathBuf::from(r"C:\foo\bar"))); - - url.path_mut().unwrap()[2] = "ba\0r".to_string(); - assert!(url.to_file_path().is_ok()); - - url.path_mut().unwrap()[2] = "ba%00r".to_string(); - assert!(url.to_file_path().is_ok()); + assert_from_file_path!(r"C:\foo\bar", "/C:/foo/bar"); + assert_from_file_path!("C:\\foo\\ba\0r", "/C:/foo/ba%00r"); // Invalid UTF-8 - url.path_mut().unwrap()[2] = "ba%80r".to_string(); - assert!(url.to_file_path().is_err()); + assert!(Url::parse("file:///C:/foo/ba%80r").unwrap().to_file_path().is_err()); // test windows canonicalized path let path = PathBuf::from(r"\\?\C:\foo\bar"); @@ -79,26 +73,23 @@ fn new_path_windows_fun() { #[test] fn new_directory_paths() { - use std::path::Path; - if cfg!(unix) { assert_eq!(Url::from_directory_path(Path::new("relative")), Err(())); assert_eq!(Url::from_directory_path(Path::new("../relative")), Err(())); let url = Url::from_directory_path(Path::new("/foo/bar")).unwrap(); - assert_eq!(url.host(), Some(&Host::Domain("".to_string()))); - assert_eq!(url.path(), Some(&["foo".to_string(), "bar".to_string(), - "".to_string()][..])); - } else { + assert_eq!(url.host(), None); + assert_eq!(url.path(), "/foo/bar/"); + } + if cfg!(windows) { assert_eq!(Url::from_directory_path(Path::new("relative")), Err(())); assert_eq!(Url::from_directory_path(Path::new(r"..\relative")), Err(())); assert_eq!(Url::from_directory_path(Path::new(r"\drive-relative")), Err(())); assert_eq!(Url::from_directory_path(Path::new(r"\\ucn\")), Err(())); let url = Url::from_directory_path(Path::new(r"C:\foo\bar")).unwrap(); - assert_eq!(url.host(), Some(&Host::Domain("".to_string()))); - assert_eq!(url.path(), Some(&["C:".to_string(), "foo".to_string(), - "bar".to_string(), 
"".to_string()][..])); + assert_eq!(url.host(), None); + assert_eq!(url.path(), "/C:/foo/bar/"); } } @@ -110,15 +101,15 @@ fn from_str() { #[test] fn issue_124() { let url: Url = "file:a".parse().unwrap(); - assert_eq!(url.path().unwrap(), ["a"]); + assert_eq!(url.path(), "/a"); let url: Url = "file:...".parse().unwrap(); - assert_eq!(url.path().unwrap(), ["..."]); + assert_eq!(url.path(), "/..."); let url: Url = "file:..".parse().unwrap(); - assert_eq!(url.path().unwrap(), [""]); + assert_eq!(url.path(), "/"); } #[test] -fn relative_scheme_data_equality() { +fn test_equality() { use std::hash::{Hash, Hasher, SipHasher}; fn check_eq(a: &Url, b: &Url) { @@ -145,7 +136,7 @@ fn relative_scheme_data_equality() { // Different ports let a: Url = url("http://example.com/"); let b: Url = url("http://example.com:8080/"); - assert!(a != b); + assert!(a != b, "{:?} != {:?}", a, b); // Different scheme let a: Url = url("http://example.com/"); @@ -165,27 +156,55 @@ fn relative_scheme_data_equality() { #[test] fn host() { - let a = Host::parse("www.mozilla.org").unwrap(); - let b = Host::parse("1.35.33.49").unwrap(); - let c = Host::parse("[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]").unwrap(); - let d = Host::parse("1.35.+33.49").unwrap(); - assert_eq!(a, Host::Domain("www.mozilla.org".to_owned())); - assert_eq!(b, Host::Ipv4(Ipv4Addr::new(1, 35, 33, 49))); - assert_eq!(c, Host::Ipv6(Ipv6Addr::new(0x2001, 0x0db8, 0x85a3, 0x08d3, - 0x1319, 0x8a2e, 0x0370, 0x7344))); - assert_eq!(d, Host::Domain("1.35.+33.49".to_owned())); - assert_eq!(Host::parse("[::]").unwrap(), Host::Ipv6(Ipv6Addr::new(0, 0, 0, 0, 0, 0, 0, 0))); - assert_eq!(Host::parse("[::1]").unwrap(), Host::Ipv6(Ipv6Addr::new(0, 0, 0, 0, 0, 0, 0, 1))); - assert_eq!(Host::parse("0x1.0X23.0x21.061").unwrap(), Host::Ipv4(Ipv4Addr::new(1, 35, 33, 49))); - assert_eq!(Host::parse("0x1232131").unwrap(), Host::Ipv4(Ipv4Addr::new(1, 35, 33, 49))); - assert!(Host::parse("42.0x1232131").is_err()); - 
assert_eq!(Host::parse("111").unwrap(), Host::Ipv4(Ipv4Addr::new(0, 0, 0, 111))); - assert_eq!(Host::parse("2..2.3").unwrap(), Host::Domain("2..2.3".to_owned())); - assert!(Host::parse("192.168.0.257").is_err()); + fn assert_host(input: &str, host: Host<&str>) { + assert_eq!(Url::parse(input).unwrap().host(), Some(host)); + } + assert_host("http://www.mozilla.org", Host::Domain("www.mozilla.org")); + assert_host("http://1.35.33.49", Host::Ipv4(Ipv4Addr::new(1, 35, 33, 49))); + assert_host("http://[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]", Host::Ipv6(Ipv6Addr::new( + 0x2001, 0x0db8, 0x85a3, 0x08d3, 0x1319, 0x8a2e, 0x0370, 0x7344))); + assert_host("http://1.35.+33.49", Host::Domain("1.35.+33.49")); + assert_host("http://[::]", Host::Ipv6(Ipv6Addr::new(0, 0, 0, 0, 0, 0, 0, 0))); + assert_host("http://[::1]", Host::Ipv6(Ipv6Addr::new(0, 0, 0, 0, 0, 0, 0, 1))); + assert_host("http://0x1.0X23.0x21.061", Host::Ipv4(Ipv4Addr::new(1, 35, 33, 49))); + assert_host("http://0x1232131", Host::Ipv4(Ipv4Addr::new(1, 35, 33, 49))); + assert_host("http://111", Host::Ipv4(Ipv4Addr::new(0, 0, 0, 111))); + assert_host("http://2..2.3", Host::Domain("2..2.3")); + assert!(Url::parse("http://42.0x1232131").is_err()); + assert!(Url::parse("http://192.168.0.257").is_err()); +} + +#[test] +fn host_serialization() { + // libstd’s `Display for Ipv6Addr` serializes 0:0:0:0:0:0:_:_ and 0:0:0:0:0:ffff:_:_ + // using IPv4-like syntax, as suggested in https://tools.ietf.org/html/rfc5952#section-4 + // but https://url.spec.whatwg.org/#concept-ipv6-serializer specifies not to. 
+ + // Not [::0.0.0.2] / [::ffff:0.0.0.2] + assert_eq!(Url::parse("http://[0::2]").unwrap().host_str(), Some("[::2]")); + assert_eq!(Url::parse("http://[0::ffff:0:2]").unwrap().host_str(), Some("[::ffff:0:2]")); } #[test] fn test_idna() { assert!("http://goșu.ro".parse::().is_ok()); - assert_eq!(Url::parse("http://☃.net/").unwrap().domain(), Some("xn--n3h.net")); + assert_eq!(Url::parse("http://☃.net/").unwrap().host(), Some(Host::Domain("xn--n3h.net"))); +} + +#[test] +fn test_serialization() { + let data = [ + ("http://example.com/", "http://example.com/"), + ("http://addslash.com", "http://addslash.com/"), + ("http://@emptyuser.com/", "http://emptyuser.com/"), + ("http://:@emptypass.com/", "http://:@emptypass.com/"), + ("http://user@user.com/", "http://user@user.com/"), + ("http://user:pass@userpass.com/", "http://user:pass@userpass.com/"), + ("http://slashquery.com/path/?q=something", "http://slashquery.com/path/?q=something"), + ("http://noslashquery.com/path?q=something", "http://noslashquery.com/path?q=something") + ]; + for &(input, result) in &data { + let url = Url::parse(input).unwrap(); + assert_eq!(url.as_str(), result); + } } diff --git a/tests/urltestdata.txt b/tests/urltestdata.txt index 88a63c18..29bf4b0c 100644 --- a/tests/urltestdata.txt +++ b/tests/urltestdata.txt @@ -41,20 +41,20 @@ http://f:\s21\s/\sb\s?\sd\s#\se\s /:23 s:http h:example.org p:/:23 :: s:http h:example.org p:/foo/:: ::23 s:http h:example.org p:/foo/::23 -foo:// s:foo p:// +foo:// s:foo p:/ http://a:b@c:29/d s:http u:a pass:b h:c port:29 p:/d http::@c:29 s:http h:example.org p:/foo/:@c:29 -http://&a:foo(b]c@d:2/ s:http u:&a pass:foo(b]c h:d port:2 p:/ -http://::@c@d:2 s:http pass::%40c h:d port:2 p:/ +http://&a:foo(b]c@d:2/ s:http u:&a pass:foo(b%5Dc h:d port:2 p:/ +http://::@c@d:2 s:http pass:%3A%40c h:d port:2 p:/ http://foo.com:b@d/ s:http u:foo.com pass:b h:d p:/ http://foo.com/\\@ s:http h:foo.com p://@ http:\\\\foo.com\\ s:http h:foo.com p:/ http:\\\\a\\b:c\\d@foo.com\\ 
s:http h:a p:/b:c/d@foo.com/ foo:/ s:foo p:/ foo:/bar.com/ s:foo p:/bar.com/ -foo:///////// s:foo p:///////// -foo://///////bar.com/ s:foo p://///////bar.com/ -foo:////:///// s:foo p:////:///// +foo:///////// s:foo p://///// +foo://///////bar.com/ s:foo p:///////bar.com/ +foo:////:///// s:foo p://:///// c:/foo s:c p:/foo //foo/bar s:http h:foo p:/bar http://foo/path;a??e#f#g s:http h:foo p:/path;a q:??e f:#f#g @@ -113,9 +113,9 @@ file:///home/me s:file p:/home/me /// s:file p:/ ///test s:file p:/test file://test s:file h:test p:/ -file://localhost s:file h:localhost p:/ -file://localhost/ s:file h:localhost p:/ -file://localhost/test s:file h:localhost p:/test +file://localhost s:file p:/ +file://localhost/ s:file p:/ +file://localhost/test s:file p:/test test s:file p:/tmp/mock/test file:test s:file p:/tmp/mock/test @@ -170,7 +170,7 @@ http://%25DOMAIN:foobar@foodomain.com/ s:http u:%25DOMAIN pass:foobar h:foodoma http:\\\\www.google.com\\foo s:http h:www.google.com p:/foo http://foo:80/ s:http h:foo p:/ http://foo:81/ s:http h:foo port:81 p:/ -httpa://foo:80/ s:httpa p://foo:80/ +httpa://foo:80/ s:httpa h:foo port:80 p:/ http://foo:-80/ https://foo:443/ s:https h:foo p:/ https://foo:80/ s:https h:foo port:80 p:/ @@ -310,8 +310,8 @@ http://%25 http://hello%00 # Escaped numbers should be treated like IP addresses if they are. -XFAIL http://%30%78%63%30%2e%30%32%35%30.01 s:http p:/ h:127.0.0.1 -XFAIL http://%30%78%63%30%2e%30%32%35%30.01%2e +http://%30%78%63%30%2e%30%32%35%30.01 s:http p:/ h:192.168.0.1 +http://%30%78%63%30%2e%30%32%35%30.01%2e s:http p:/ h:192.168.0.1 # Invalid escaping should trigger the regular host error handling. http://%3g%78%63%30%2e%30%32%35%30%2E.01 @@ -325,5 +325,5 @@ http://192.168.0.1\shello http://\uff10\uff38\uff43\uff10\uff0e\uff10\uff12\uff15\uff10\uff0e\uff10\uff11 s:http p:/ h:192.168.0.1 # Broken IP addresses. 
-XFAIL http://192.168.0.257 +http://192.168.0.257 http://[google.com] diff --git a/tests/wpt.rs b/tests/wpt.rs index 6a32287b..6f9f4021 100644 --- a/tests/wpt.rs +++ b/tests/wpt.rs @@ -12,97 +12,63 @@ extern crate test; extern crate url; use std::char; -use url::{RelativeSchemeData, SchemeData, Url}; +use url::Url; fn run_one(entry: Entry) { - // FIXME: Don’t re-indent to make merging the 1.0 branch easier. - { - let Entry { - input, - base, - scheme: expected_scheme, - username: expected_username, - password: expected_password, - host: expected_host, - port: expected_port, - path: expected_path, - query: expected_query, - fragment: expected_fragment, - expected_failure, - } = entry; - let base = match Url::parse(&base) { - Ok(base) => base, - Err(message) => panic!("Error parsing base {}: {}", base, message) - }; - let url = base.join(&input); - if expected_scheme.is_none() { - if url.is_ok() && !expected_failure { - panic!("Expected a parse error for URL {}", input); - } + let Entry { + input, + base, + scheme: expected_scheme, + username: expected_username, + password: expected_password, + host: expected_host, + port: expected_port, + path: expected_path, + query: expected_query, + fragment: expected_fragment, + expected_failure, + } = entry; + let base = match Url::parse(&base) { + Ok(base) => base, + Err(message) => panic!("Error parsing base {}: {}", base, message) + }; + let expecting_err = expected_scheme.is_none() ^ expected_failure; + let url = match base.join(&input) { + Ok(url) => url, + Err(reason) => { + assert!(expecting_err, "Error parsing URL {}: {}", input, reason); return } - let Url { scheme, scheme_data, query, fragment, .. } = match url { - Ok(url) => url, - Err(message) => { - if expected_failure { - return - } else { - panic!("Error parsing URL {}: {}", input, message) - } - } - }; + }; + assert!(!expecting_err, "Expected a parse error for URL {}", input); - macro_rules! 
assert_eq { - ($a: expr, $b: expr) => { - { - let a = $a; - let b = $b; - if a != b { - if expected_failure { - return - } else { - panic!("{:?} != {:?}", a, b) - } + macro_rules! assert_eq { + ($a: expr, $b: expr) => { + { + let a = $a; + let b = $b; + if a != b { + if expected_failure { + return + } else { + panic!("{:?} != {:?} for {:?}", a, b, url) } } } } - - assert_eq!(Some(scheme), expected_scheme); - match scheme_data { - SchemeData::Relative(RelativeSchemeData { - username, password, host, port, default_port: _, path, - }) => { - assert_eq!(username, expected_username); - assert_eq!(password, expected_password); - let host = host.serialize(); - assert_eq!(host, expected_host); - assert_eq!(port, expected_port); - assert_eq!(Some(format!("/{}", str_join(&path, "/"))), expected_path); - }, - SchemeData::NonRelative(scheme_data) => { - assert_eq!(Some(scheme_data), expected_path); - assert_eq!(String::new(), expected_username); - assert_eq!(None, expected_password); - assert_eq!(String::new(), expected_host); - assert_eq!(None, expected_port); - }, - } - fn opt_prepend(prefix: &str, opt_s: Option) -> Option { - opt_s.map(|s| format!("{}{}", prefix, s)) - } - assert_eq!(opt_prepend("?", query), expected_query); - assert_eq!(opt_prepend("#", fragment), expected_fragment); - - assert!(!expected_failure, "Unexpected success for {}", input); } -} -// FIMXE: Remove this when &[&str]::join (the new name) lands in the stable channel. 
-#[allow(deprecated)] -fn str_join>(pieces: &[T], separator: &str) -> String { - pieces.connect(separator) + assert_eq!(Some(url.scheme().to_owned()), expected_scheme); + assert_eq!(url.username(), expected_username); + assert_eq!(url.password().map(|s| s.to_owned()), expected_password); + assert_eq!(url.host_str().unwrap_or("").to_owned(), expected_host); + assert_eq!(url.port(), expected_port); + assert_eq!(Some(url.path().to_owned()), expected_path); + assert_eq!(url.query().map(|s| format!("?{}", s)), expected_query); + assert_eq!(url.fragment().map(|s| format!("#{}", s)), expected_fragment); + + assert!(!expected_failure, "Unexpected success for {}", input); } struct Entry { From 297221880017a9f72f2eaab3ab1a2cd9091e4daa Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 8 Feb 2016 20:51:14 +0100 Subject: [PATCH 07/89] Remove the dependency on uuid. --- Cargo.toml | 1 - src/lib.rs | 50 ++------------------------------------ src/origin.rs | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 49 deletions(-) create mode 100644 src/origin.rs diff --git a/Cargo.toml b/Cargo.toml index 33e23fe8..9258e5bc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,6 +45,5 @@ optional = true [dependencies] idna = { version = "0.1.0", path = "./idna" } -uuid = { version = "0.2", features = ["v4"] } rustc-serialize = "0.3" matches = "0.1" diff --git a/src/lib.rs b/src/lib.rs index 725100f6..fed134f6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -124,7 +124,6 @@ extern crate rustc_serialize; #[cfg(feature="heap_size")] #[macro_use] extern crate heapsize; extern crate idna; -extern crate uuid; use host::HostInternal; use percent_encoding::{PATH_SEGMENT_ENCODE_SET, percent_encode_to}; @@ -134,14 +133,15 @@ use std::hash; use std::ops::{Range, RangeFrom, RangeTo}; use std::path::{Path, PathBuf}; use std::str; -use uuid::Uuid; pub use encoding::EncodingOverride; +pub use origin::Origin; pub use parser::ParseError; pub use host::Host; mod encoding; 
mod host; +mod origin; mod parser; pub mod percent_encoding; pub mod form_urlencoded; @@ -165,31 +165,6 @@ pub struct Url { fragment_start: Option, // Before '#', unlike Position::FragmentStart } -/// Opaque identifier for URLs that have file or other schemes -#[derive(PartialEq, Eq, Clone, Debug)] -pub struct OpaqueOrigin(Uuid); - -#[cfg(feature="heap_size")] -known_heap_size!(0, OpaqueOrigin); - -impl OpaqueOrigin { - /// Creates a new opaque origin with a random UUID. - pub fn new() -> OpaqueOrigin { - OpaqueOrigin(Uuid::new_v4()) - } -} - -/// The origin of the URL -#[derive(PartialEq, Eq, Clone, Debug)] -#[cfg_attr(feature="heap_size", derive(HeapSizeOf))] -pub enum Origin { - /// A globally unique identifier - UID(OpaqueOrigin), - - /// Consists of the URL's scheme, host and port - Tuple(String, Host, u16) -} - impl Url { /// Parse an absolute URL from a string. #[inline] @@ -443,27 +418,6 @@ impl Url { Err(()) } - /// Return the origin of this URL (https://url.spec.whatwg.org/#origin) - pub fn origin(&self) -> Origin { - let scheme = self.scheme(); - match scheme { - "blob" => { - let result = Url::parse(self.path()); - match result { - Ok(ref url) => url.origin(), - Err(_) => Origin::UID(OpaqueOrigin::new()) - } - }, - "ftp" | "gopher" | "http" | "https" | "ws" | "wss" => { - Origin::Tuple(scheme.to_owned(), self.host().unwrap().to_owned(), - self.port_or_default().unwrap()) - }, - // TODO: Figure out what to do if the scheme is a file - "file" => Origin::UID(OpaqueOrigin::new()), - _ => Origin::UID(OpaqueOrigin::new()) - } - } - /// Parse the URL’s query string, if any, as `application/x-www-form-urlencoded` /// and return a vector of (key, value) pairs. #[inline] diff --git a/src/origin.rs b/src/origin.rs new file mode 100644 index 00000000..10011c42 --- /dev/null +++ b/src/origin.rs @@ -0,0 +1,67 @@ +// Copyright 2016 Simon Sapin. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. 
This file may not be copied, modified, or distributed +// except according to those terms. + +use Url; +use host::Host; + +impl Url { + /// Return the origin of this URL (https://url.spec.whatwg.org/#origin) + pub fn origin(&self) -> Origin { + let scheme = self.scheme(); + match scheme { + "blob" => { + let result = Url::parse(self.path()); + match result { + Ok(ref url) => url.origin(), + Err(_) => Origin::new_opaque() + } + }, + "ftp" | "gopher" | "http" | "https" | "ws" | "wss" => { + Origin::Tuple(scheme.to_owned(), self.host().unwrap().to_owned(), + self.port_or_default().unwrap()) + }, + // TODO: Figure out what to do if the scheme is a file + "file" => Origin::new_opaque(), + _ => Origin::new_opaque() + } + } +} + +/// The origin of an URL +#[derive(PartialEq, Eq, Clone, Debug)] +#[cfg_attr(feature="heap_size", derive(HeapSizeOf))] +pub enum Origin { + /// A globally unique identifier + Opaque(OpaqueOrigin), + + /// Consists of the URL's scheme, host and port + Tuple(String, Host, u16) +} + +impl Origin { + /// Creates a new opaque origin that is only equal to itself. + pub fn new_opaque() -> Origin { + Origin::Opaque(OpaqueOrigin(Box::new(0))) + } +} + +/// Opaque identifier for URLs that have file or other schemes +#[derive(Eq, Clone, Debug)] +#[cfg_attr(feature="heap_size", derive(HeapSizeOf))] +// `u8` is a dummy non-zero-sized type to force the allocator to return a unique pointer. +// (It returns `std::heap::EMPTY` for zero-sized allocations.) +pub struct OpaqueOrigin(Arc); + +/// Note that `opaque_origin.clone() != opaque_origin`. +impl PartialEq for OpaqueOrigin { + fn eq(&self, other: &Self) -> bool { + let a: *const u8 = &*self.0; + let b: *const u8 = &*other.0; + a == b + } +} From 162e23feb4a18f9f90ff68ac62088f7b692fadda Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 8 Feb 2016 20:56:34 +0100 Subject: [PATCH 08/89] Add URL slicing/indexing by component. 
--- src/lib.rs | 6 +- src/slicing.rs | 184 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 189 insertions(+), 1 deletion(-) create mode 100644 src/slicing.rs diff --git a/src/lib.rs b/src/lib.rs index fed134f6..8daceca7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -136,13 +136,17 @@ use std::str; pub use encoding::EncodingOverride; pub use origin::Origin; -pub use parser::ParseError; pub use host::Host; +pub use parser::ParseError; +pub use slicing::Position; mod encoding; mod host; +mod idna_mapping; mod origin; mod parser; +mod slicing; + pub mod percent_encoding; pub mod form_urlencoded; diff --git a/src/slicing.rs b/src/slicing.rs new file mode 100644 index 00000000..c786e4d2 --- /dev/null +++ b/src/slicing.rs @@ -0,0 +1,184 @@ +// Copyright 2016 Simon Sapin. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::ops::{Range, RangeFrom, RangeTo, RangeFull, Index}; +use Url; + +impl Index for Url { + type Output = str; + fn index(&self, _: RangeFull) -> &str { + &self.serialization + } +} + +impl Index> for Url { + type Output = str; + fn index(&self, range: RangeFrom) -> &str { + &self.serialization[self.index(range.start)..] + } +} + +impl Index> for Url { + type Output = str; + fn index(&self, range: RangeTo) -> &str { + &self.serialization[..self.index(range.end)] + } +} + +impl Index> for Url { + type Output = str; + fn index(&self, range: Range) -> &str { + &self.serialization[self.index(range.start)..self.index(range.end)] + } +} + +/// Indicates a position within a URL based on its components. 
+/// +/// A range of positions can be used for slicing `Url`: +/// +/// ```rust +/// # use url::{Url, Position}; +/// # fn something(some_url: Url) { +/// let serialization: &str = &some_url[..]; +/// let serialization_without_fragment: &str = &some_url[..Position::QueryEnd]; +/// let authority: &str = &some_url[Position::UsernameStart..Position::PortEnd]; +/// let data_url_payload: &str = &some_url[Position::PathStart..Position::QueryEnd]; +/// let scheme_relative: &str = &some_url[Position::UsernameStart..]; +/// # } +/// ``` +/// +/// In a pseudo-grammar (where `[`…`]?` makes a sub-sequence optional), +/// URL components and delimiters that separate them are: +/// +/// ```notrust +/// url = +/// scheme ":" +/// [ "//" [ username [ ":" password ]? "@" ]? host [ ":" port ]? ] +/// path [ "?" query ]? [ "#" fragment ]? +/// ``` +/// +/// When a given component is not present, +/// its "start" and "end" position are the same +/// (so that `&some_url[FooStart..FooEnd]` is the empty string) +/// and component ordering is preserved +/// (so that a missing query "is between" a path and a fragment). +/// +/// The end of a component and the start of the next are either the same or separate +/// by a delimiter. +/// (Not that the initial `/` of a path is considered part of the path here, not a delimiter.) +/// For example, `&url[..FragmentStart]` would include a `#` delimiter (if present in `url`), +/// so `&url[..QueryEnd]` might be desired instead. +/// +/// `SchemeStart` and `FragmentEnd` are always the start and end of the entire URL, +/// so `&url[SchemeStart..X]` is the same as `&url[..X]` +/// and `&url[X..FragmentEnd]` is the same as `&url[X..]`. 
+pub enum Position { + SchemeStart, + SchemeEnd, + UsernameStart, + UsernameEnd, + PasswordStart, + PasswordEnd, + HostStart, + HostEnd, + PortStart, + PortEnd, + PathStart, + PathEnd, + QueryStart, + QueryEnd, + FragmentStart, + FragmentEnd +} + +impl Url { + #[inline] + fn index(&self, position: Position) -> usize { + match position { + Position::SchemeStart => 0, + + Position::SchemeEnd => self.scheme_end as usize, + + Position::UsernameStart => if self.non_relative { + debug_assert!(self.byte_at(self.scheme_end) == b':'); + debug_assert!(self.scheme_end + ":".len() as u32 == self.username_end); + self.scheme_end as usize + ":".len() + } else { + debug_assert!(self.slice(self.scheme_end..).starts_with("://")); + self.scheme_end as usize + "://".len() + }, + + Position::UsernameEnd => self.username_end as usize, + + Position::PasswordStart => if self.port.is_some() { + debug_assert!(self.has_host()); + debug_assert!(self.byte_at(self.username_end) == b':'); + self.username_end as usize + ":".len() + } else { + debug_assert!(self.username_end == self.host_start); + self.username_end as usize + }, + + Position::PasswordEnd => if self.port.is_some() { + debug_assert!(self.has_host()); + debug_assert!(self.byte_at(self.username_end) == b':'); + debug_assert!(self.byte_at(self.host_start - "@".len() as u32) == b'@'); + self.host_start as usize - "@".len() + } else { + debug_assert!(self.username_end == self.host_start); + self.host_start as usize + }, + + Position::HostStart => self.host_start as usize, + + Position::HostEnd => self.host_end as usize, + + Position::PortStart => if self.port.is_some() { + debug_assert!(self.byte_at(self.host_end) == b':'); + self.host_end as usize + ":".len() + } else { + self.host_end as usize + }, + + Position::PortEnd => self.path_start as usize, + + Position::PathStart => self.path_start as usize, + + Position::PathEnd => match (self.query_start, self.fragment_start) { + (Some(q), _) => q as usize, + (None, Some(f)) => f as usize, 
+ (None, None) => self.serialization.len(), + }, + + Position::QueryStart => match (self.query_start, self.fragment_start) { + (Some(q), _) => { + debug_assert!(self.byte_at(q) == b'?'); + q as usize + "?".len() + } + (None, Some(f)) => f as usize, + (None, None) => self.serialization.len(), + }, + + Position::QueryEnd => match self.fragment_start { + None => self.serialization.len(), + Some(f) => f as usize, + }, + + Position::FragmentStart => match self.fragment_start { + Some(f) => { + debug_assert!(self.byte_at(f) == b'#'); + f as usize + "#".len() + } + None => self.serialization.len(), + }, + + Position::FragmentEnd => self.serialization.len(), + } + } +} + From fa7048204bce196eed81122336abb27fb0b627ab Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 8 Feb 2016 21:01:25 +0100 Subject: [PATCH 09/89] Add stubs with partial implementation for the WebIDL API. --- src/lib.rs | 2 + src/webidl.rs | 157 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+) create mode 100644 src/webidl.rs diff --git a/src/lib.rs b/src/lib.rs index 8daceca7..88b04bac 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -139,6 +139,7 @@ pub use origin::Origin; pub use host::Host; pub use parser::ParseError; pub use slicing::Position; +pub use webidl::WebIdl; mod encoding; mod host; @@ -146,6 +147,7 @@ mod idna_mapping; mod origin; mod parser; mod slicing; +mod webidl; pub mod percent_encoding; pub mod form_urlencoded; diff --git a/src/webidl.rs b/src/webidl.rs new file mode 100644 index 00000000..e765e473 --- /dev/null +++ b/src/webidl.rs @@ -0,0 +1,157 @@ +// Copyright 2016 Simon Sapin. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +use {Url, ParseError}; + +/// https://url.spec.whatwg.org/#api +pub struct WebIdl; + +impl WebIdl { + /// **Not implemented yet** https://url.spec.whatwg.org/#dom-url-domaintoascii + pub fn domain_to_ascii(_domain: &str) -> String { + unimplemented!() // FIXME + } + + /// **Not implemented yet** https://url.spec.whatwg.org/#dom-url-domaintounicode + pub fn domain_to_unicode(_domain: &str) -> String { + unimplemented!() // FIXME + } + + pub fn href(url: &Url) -> &str { + &url.serialization + } + + pub fn set_href(url: &mut Url, value: &str) -> Result<(), ParseError> { + *url = try!(Url::parse(value)); + Ok(()) + } + + /// **Not implemented yet** Getter for https://url.spec.whatwg.org/#dom-url-origin + pub fn get_origin(_url: &Url) -> String { + unimplemented!() // FIXME + } + + /// Getter for https://url.spec.whatwg.org/#dom-url-protocol + #[inline] + pub fn get_protocol(url: &Url) -> &str { + debug_assert!(url.byte_at(url.scheme_end) == b':'); + url.slice(..url.scheme_end + 1) + } + + /// **Not implemented yet** Setter for https://url.spec.whatwg.org/#dom-url-protocol + pub fn set_protocol(_url: &mut Url, _new_protocol: &str) { + unimplemented!() // FIXME + } + + /// Getter for https://url.spec.whatwg.org/#dom-url-username + #[inline] + pub fn get_username(url: &Url) -> &str { + url.username() + } + + /// **Not implemented yet** Setter for https://url.spec.whatwg.org/#dom-url-username + pub fn set_username(_url: &mut Url, _new_username: &str) { + unimplemented!() // FIXME + } + + /// Getter for https://url.spec.whatwg.org/#dom-url-password + #[inline] + pub fn get_password(url: &Url) -> &str { + url.password().unwrap_or("") + } + + /// **Not implemented yet** Setter for https://url.spec.whatwg.org/#dom-url-password + pub fn set_password(_url: &mut Url, _new_password: &str) { + unimplemented!() // FIXME + } + + /// Getter for https://url.spec.whatwg.org/#dom-url-host + #[inline] + pub fn get_host(url: &Url) -> &str { + let host = 
url.slice(url.host_start..url.host_end); + debug_assert!(!host.is_empty() || url.non_relative); + host + } + + /// **Not implemented yet** Setter for https://url.spec.whatwg.org/#dom-url-host + pub fn set_host(_url: &mut Url, _new_host: &str) { + unimplemented!() // FIXME + } + + /// Getter for https://url.spec.whatwg.org/#dom-url-hostname + #[inline] + pub fn get_hostname(url: &Url) -> &str { + url.host_str().unwrap_or("") + } + + /// **Not implemented yet** Setter for https://url.spec.whatwg.org/#dom-url-hostname + pub fn set_hostname(_url: &mut Url, _new_hostname: &str) { + unimplemented!() // FIXME + } + + /// Getter for https://url.spec.whatwg.org/#dom-url-port + #[inline] + pub fn get_port(url: &Url) -> &str { + if url.port.is_some() { + debug_assert!(url.byte_at(url.host_end) == b':'); + url.slice(url.host_end + 1..url.path_start) + } else { + "" + } + } + + /// **Not implemented yet** Setter for https://url.spec.whatwg.org/#dom-url-port + pub fn set_port(_url: &mut Url, _new_port: &str) { + unimplemented!() // FIXME + } + + /// Getter for https://url.spec.whatwg.org/#dom-url-pathname + #[inline] + pub fn get_pathname(url: &Url) -> &str { + url.path() + } + + /// **Not implemented yet** Setter for https://url.spec.whatwg.org/#dom-url-pathname + pub fn set_pathname(_url: &mut Url, _new_pathname: &str) { + unimplemented!() // FIXME + } + + /// Getter for https://url.spec.whatwg.org/#dom-url-search + pub fn get_search(url: &Url) -> &str { + match (url.query_start, url.fragment_start) { + (None, _) => "", + (Some(query_start), None) => url.slice(query_start..), + (Some(query_start), Some(fragment_start)) => { + url.slice(query_start..fragment_start) + } + } + } + + /// **Not implemented yet** Setter for https://url.spec.whatwg.org/#dom-url-search + pub fn set_search(_url: &mut Url, _new_search: &str) { + unimplemented!() // FIXME + } + + /// **Not implemented yet** Getter for https://url.spec.whatwg.org/#dom-url-searchparams + pub fn get_search_params(_url: 
&Url) -> Vec<(String, String)> { + unimplemented!(); // FIXME + } + + /// Getter for https://url.spec.whatwg.org/#dom-url-hash + pub fn get_hash(url: &Url) -> &str { + match url.fragment_start { + Some(start) => url.slice(start..), + None => "", + } + } + + /// **Not implemented yet** Setter for https://url.spec.whatwg.org/#dom-url-hash + pub fn set_hash(_url: &mut Url, _new_hash: &str) { + unimplemented!() // FIXME + } +} From 46d9fc9a24f75796856677aeda6409cd00ca44bc Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 8 Feb 2016 22:11:31 +0100 Subject: [PATCH 10/89] Shorter Cargo.toml syntax. --- Cargo.toml | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9258e5bc..8f8a5541 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,23 +27,11 @@ query_encoding = ["encoding"] serde_serialization = ["serde"] heap_size = ["heapsize", "heapsize_plugin"] -[dependencies.heapsize] -version = ">=0.1.1, <0.4" -optional = true - -[dependencies.heapsize_plugin] -version = "0.1.0" -optional = true - -[dependencies.encoding] -version = "0.2" -optional = true - -[dependencies.serde] -version = ">=0.6.1, <0.8" -optional = true - [dependencies] idna = { version = "0.1.0", path = "./idna" } +heapsize = {version = ">=0.1.1, <0.4", optional = true} +heapsize_plugin = {version = "0.1.0", optional = true} +encoding = {version = "0.2", optional = true} +serde = {version = ">=0.6.1, <0.8", optional = true} rustc-serialize = "0.3" matches = "0.1" From b6686eb23bb9455d29256da1df54e37bf07d0a47 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 8 Feb 2016 22:13:15 +0100 Subject: [PATCH 11/89] serde_serialization -> serde Optional dependencies *are* Cargo features. 
--- Cargo.toml | 1 - Makefile | 4 +--- src/lib.rs | 10 +++++----- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 8f8a5541..85aecef6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,7 +24,6 @@ rustc-test = "0.1" [features] query_encoding = ["encoding"] -serde_serialization = ["serde"] heap_size = ["heapsize", "heapsize_plugin"] [dependencies] diff --git a/Makefile b/Makefile index e46603be..131c2db2 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,5 @@ test: - cargo test --features query_encoding - cargo test --features serde_serialization - cargo test + cargo test --features "query_encoding serde" [ x$$TRAVIS_RUST_VERSION != xnightly ] || cargo test --features heap_size doc: diff --git a/src/lib.rs b/src/lib.rs index 88b04bac..8bf19491 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -120,7 +120,7 @@ assert_eq!(css_url.as_str(), "http://servo.github.io/rust-url/main.css") extern crate rustc_serialize; #[macro_use] extern crate matches; -#[cfg(feature="serde_serialization")] extern crate serde; +#[cfg(feature="serde")] extern crate serde; #[cfg(feature="heap_size")] #[macro_use] extern crate heapsize; extern crate idna; @@ -555,8 +555,8 @@ impl rustc_serialize::Decodable for Url { /// Serializes this URL into a `serde` stream. /// -/// This implementation is only available if the `serde_serialization` Cargo feature is enabled. -#[cfg(feature="serde_serialization")] +/// This implementation is only available if the `serde` Cargo feature is enabled. +#[cfg(feature="serde")] impl serde::Serialize for Url { fn serialize(&self, serializer: &mut S) -> Result<(), S::Error> where S: serde::Serializer { format!("{}", self).serialize(serializer) @@ -565,8 +565,8 @@ impl serde::Serialize for Url { /// Deserializes this URL from a `serde` stream. /// -/// This implementation is only available if the `serde_serialization` Cargo feature is enabled. 
-#[cfg(feature="serde_serialization")] +/// This implementation is only available if the `serde` Cargo feature is enabled. +#[cfg(feature="serde")] impl serde::Deserialize for Url { fn deserialize(deserializer: &mut D) -> Result where D: serde::Deserializer { let string_representation: String = try!(serde::Deserialize::deserialize(deserializer)); From 7517c8d565bb42d0e59df4200ee08173beed9903 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 8 Feb 2016 22:16:18 +0100 Subject: [PATCH 12/89] Make rustc-serialize an optional dependency. --- Cargo.toml | 3 ++- Makefile | 2 +- src/lib.rs | 4 +++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 85aecef6..31804048 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,6 +21,7 @@ harness = false [dev-dependencies] rustc-test = "0.1" +rustc-serialize = "0.3" [features] query_encoding = ["encoding"] @@ -32,5 +33,5 @@ heapsize = {version = ">=0.1.1, <0.4", optional = true} heapsize_plugin = {version = "0.1.0", optional = true} encoding = {version = "0.2", optional = true} serde = {version = ">=0.6.1, <0.8", optional = true} -rustc-serialize = "0.3" +rustc-serialize = {version = "0.3", optional = true} matches = "0.1" diff --git a/Makefile b/Makefile index 131c2db2..f76adfe1 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ test: - cargo test --features "query_encoding serde" + cargo test --features "query_encoding serde rustc-serialize" [ x$$TRAVIS_RUST_VERSION != xnightly ] || cargo test --features heap_size doc: diff --git a/src/lib.rs b/src/lib.rs index 8bf19491..c079be5d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -118,7 +118,7 @@ assert_eq!(css_url.as_str(), "http://servo.github.io/rust-url/main.css") #![cfg_attr(feature="heap_size", feature(plugin, custom_derive))] #![cfg_attr(feature="heap_size", plugin(heapsize_plugin))] -extern crate rustc_serialize; +#[cfg(feature="rustc-serialize")] extern crate rustc_serialize; #[macro_use] extern crate matches; #[cfg(feature="serde")] 
extern crate serde; #[cfg(feature="heap_size")] #[macro_use] extern crate heapsize; @@ -538,6 +538,7 @@ impl RangeArg for RangeTo { } } +#[cfg(feature="rustc-serialize")] impl rustc_serialize::Encodable for Url { fn encode(&self, encoder: &mut S) -> Result<(), S::Error> { encoder.emit_str(self.as_str()) @@ -545,6 +546,7 @@ impl rustc_serialize::Encodable for Url { } +#[cfg(feature="rustc-serialize")] impl rustc_serialize::Decodable for Url { fn decode(decoder: &mut D) -> Result { Url::parse(&*try!(decoder.read_str())).map_err(|error| { From 7881aa56f48648c4a61a2cf6222373282635f632 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 9 Feb 2016 17:04:25 +0100 Subject: [PATCH 13/89] Rename *{Start,End} positions to {Before,After}* --- src/slicing.rs | 86 +++++++++++++++++++++++++------------------------- 1 file changed, 43 insertions(+), 43 deletions(-) diff --git a/src/slicing.rs b/src/slicing.rs index c786e4d2..63697942 100644 --- a/src/slicing.rs +++ b/src/slicing.rs @@ -45,10 +45,10 @@ impl Index> for Url { /// # use url::{Url, Position}; /// # fn something(some_url: Url) { /// let serialization: &str = &some_url[..]; -/// let serialization_without_fragment: &str = &some_url[..Position::QueryEnd]; -/// let authority: &str = &some_url[Position::UsernameStart..Position::PortEnd]; -/// let data_url_payload: &str = &some_url[Position::PathStart..Position::QueryEnd]; -/// let scheme_relative: &str = &some_url[Position::UsernameStart..]; +/// let serialization_without_fragment: &str = &some_url[..Position::AfterQuery]; +/// let authority: &str = &some_url[Position::BeforeUsername..Position::AfterPort]; +/// let data_url_payload: &str = &some_url[Position::BeforePath..Position::AfterQuery]; +/// let scheme_relative: &str = &some_url[Position::BeforeUsername..]; /// # } /// ``` /// @@ -63,48 +63,48 @@ impl Index> for Url { /// ``` /// /// When a given component is not present, -/// its "start" and "end" position are the same -/// (so that `&some_url[FooStart..FooEnd]` 
is the empty string) +/// its "before" and "after" position are the same +/// (so that `&some_url[BeforeFoo..AfterFoo]` is the empty string) /// and component ordering is preserved /// (so that a missing query "is between" a path and a fragment). /// /// The end of a component and the start of the next are either the same or separate /// by a delimiter. /// (Not that the initial `/` of a path is considered part of the path here, not a delimiter.) -/// For example, `&url[..FragmentStart]` would include a `#` delimiter (if present in `url`), -/// so `&url[..QueryEnd]` might be desired instead. +/// For example, `&url[..BeforeFragment]` would include a `#` delimiter (if present in `url`), +/// so `&url[..AfterQuery]` might be desired instead. /// -/// `SchemeStart` and `FragmentEnd` are always the start and end of the entire URL, -/// so `&url[SchemeStart..X]` is the same as `&url[..X]` -/// and `&url[X..FragmentEnd]` is the same as `&url[X..]`. +/// `BeforeScheme` and `AfterFragment` are always the start and end of the entire URL, +/// so `&url[BeforeScheme..X]` is the same as `&url[..X]` +/// and `&url[X..AfterFragment]` is the same as `&url[X..]`. 
pub enum Position { - SchemeStart, - SchemeEnd, - UsernameStart, - UsernameEnd, - PasswordStart, - PasswordEnd, - HostStart, - HostEnd, - PortStart, - PortEnd, - PathStart, - PathEnd, - QueryStart, - QueryEnd, - FragmentStart, - FragmentEnd + BeforeScheme, + AfterScheme, + BeforeUsername, + AfterUsername, + BeforePassword, + AfterPassword, + BeforeHost, + AfterHost, + BeforePort, + AfterPort, + BeforePath, + AfterPath, + BeforeQuery, + AfterQuery, + BeforeFragment, + AfterFragment } impl Url { #[inline] fn index(&self, position: Position) -> usize { match position { - Position::SchemeStart => 0, + Position::BeforeScheme => 0, - Position::SchemeEnd => self.scheme_end as usize, + Position::AfterScheme => self.scheme_end as usize, - Position::UsernameStart => if self.non_relative { + Position::BeforeUsername => if self.non_relative { debug_assert!(self.byte_at(self.scheme_end) == b':'); debug_assert!(self.scheme_end + ":".len() as u32 == self.username_end); self.scheme_end as usize + ":".len() @@ -113,9 +113,9 @@ impl Url { self.scheme_end as usize + "://".len() }, - Position::UsernameEnd => self.username_end as usize, + Position::AfterUsername => self.username_end as usize, - Position::PasswordStart => if self.port.is_some() { + Position::BeforePassword => if self.port.is_some() { debug_assert!(self.has_host()); debug_assert!(self.byte_at(self.username_end) == b':'); self.username_end as usize + ":".len() @@ -124,7 +124,7 @@ impl Url { self.username_end as usize }, - Position::PasswordEnd => if self.port.is_some() { + Position::AfterPassword => if self.port.is_some() { debug_assert!(self.has_host()); debug_assert!(self.byte_at(self.username_end) == b':'); debug_assert!(self.byte_at(self.host_start - "@".len() as u32) == b'@'); @@ -134,28 +134,28 @@ impl Url { self.host_start as usize }, - Position::HostStart => self.host_start as usize, + Position::BeforeHost => self.host_start as usize, - Position::HostEnd => self.host_end as usize, + Position::AfterHost => 
self.host_end as usize, - Position::PortStart => if self.port.is_some() { + Position::BeforePort => if self.port.is_some() { debug_assert!(self.byte_at(self.host_end) == b':'); self.host_end as usize + ":".len() } else { self.host_end as usize }, - Position::PortEnd => self.path_start as usize, + Position::AfterPort => self.path_start as usize, - Position::PathStart => self.path_start as usize, + Position::BeforePath => self.path_start as usize, - Position::PathEnd => match (self.query_start, self.fragment_start) { + Position::AfterPath => match (self.query_start, self.fragment_start) { (Some(q), _) => q as usize, (None, Some(f)) => f as usize, (None, None) => self.serialization.len(), }, - Position::QueryStart => match (self.query_start, self.fragment_start) { + Position::BeforeQuery => match (self.query_start, self.fragment_start) { (Some(q), _) => { debug_assert!(self.byte_at(q) == b'?'); q as usize + "?".len() @@ -164,12 +164,12 @@ impl Url { (None, None) => self.serialization.len(), }, - Position::QueryEnd => match self.fragment_start { + Position::AfterQuery => match self.fragment_start { None => self.serialization.len(), Some(f) => f as usize, }, - Position::FragmentStart => match self.fragment_start { + Position::BeforeFragment => match self.fragment_start { Some(f) => { debug_assert!(self.byte_at(f) == b'#'); f as usize + "#".len() @@ -177,7 +177,7 @@ impl Url { None => self.serialization.len(), }, - Position::FragmentEnd => self.serialization.len(), + Position::AfterFragment => self.serialization.len(), } } } From a880bd398567503a907224f0690a7060cb2fdb4b Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 9 Feb 2016 17:18:53 +0100 Subject: [PATCH 14/89] Replace from_hex() with char::to_digit(16) --- src/host.rs | 4 ++-- src/percent_encoding.rs | 34 ++++------------------------------ 2 files changed, 6 insertions(+), 32 deletions(-) diff --git a/src/host.rs b/src/host.rs index a1b1f2af..ceee9e71 100644 --- a/src/host.rs +++ b/src/host.rs @@ -10,7 +10,7 
@@ use std::cmp; use std::fmt::{self, Formatter, Write}; use std::net::{Ipv4Addr, Ipv6Addr}; use parser::{ParseResult, ParseError}; -use percent_encoding::{from_hex, percent_decode}; +use percent_encoding::percent_decode; use idna; #[derive(Copy, Clone, Debug)] @@ -259,7 +259,7 @@ fn parse_ipv6addr(input: &str) -> ParseResult { let end = cmp::min(len, start + 4); let mut value = 0u16; while i < end { - match from_hex(input[i]) { + match (input[i] as char).to_digit(16) { Some(digit) => { value = value * 0x10 + digit as u16; i += 1; diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs index 5805b10d..2fbe0ad3 100644 --- a/src/percent_encoding.rs +++ b/src/percent_encoding.rs @@ -169,8 +169,10 @@ pub fn percent_decode_to(input: &[u8], output: &mut Vec) { while i < input.len() { let c = input[i]; if c == b'%' && i + 2 < input.len() { - if let (Some(h), Some(l)) = (from_hex(input[i + 1]), from_hex(input[i + 2])) { - output.push(h * 0x10 + l); + let h = (input[i + 1] as char).to_digit(16); + let l = (input[i + 2] as char).to_digit(16); + if let (Some(h), Some(l)) = (h, l) { + output.push(h as u8 * 0x10 + l as u8); i += 3; continue } @@ -199,31 +201,3 @@ pub fn percent_decode(input: &[u8]) -> Vec { pub fn lossy_utf8_percent_decode(input: &[u8]) -> String { String::from_utf8_lossy(&percent_decode(input)).to_string() } - -/// Convert the given hex character into its numeric value. 
-/// -/// # Examples -/// -/// ``` -/// use url::percent_encoding::from_hex; -/// assert_eq!(from_hex('0' as u8), Some(0)); -/// assert_eq!(from_hex('1' as u8), Some(1)); -/// assert_eq!(from_hex('9' as u8), Some(9)); -/// assert_eq!(from_hex('A' as u8), Some(10)); -/// assert_eq!(from_hex('a' as u8), Some(10)); -/// assert_eq!(from_hex('F' as u8), Some(15)); -/// assert_eq!(from_hex('f' as u8), Some(15)); -/// assert_eq!(from_hex('G' as u8), None); -/// assert_eq!(from_hex('g' as u8), None); -/// assert_eq!(from_hex('Z' as u8), None); -/// assert_eq!(from_hex('z' as u8), None); -/// ``` -#[inline] -pub fn from_hex(byte: u8) -> Option { - match byte { - b'0' ... b'9' => Some(byte - b'0'), // 0..9 - b'A' ... b'F' => Some(byte + 10 - b'A'), // A..F - b'a' ... b'f' => Some(byte + 10 - b'a'), // a..f - _ => None - } -} From 6db8b84f958ed64c36c4d50bae9468d72af4406b Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 9 Feb 2016 17:39:47 +0100 Subject: [PATCH 15/89] Make percent-decoding an iterator. 
--- src/form_urlencoded.rs | 4 +-- src/host.rs | 5 ++-- src/lib.rs | 10 ++----- src/percent_encoding.rs | 66 +++++++++++++++++++++++++---------------- 4 files changed, 47 insertions(+), 38 deletions(-) diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index 9af1cc34..9e19b692 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -84,8 +84,8 @@ fn parse_internal(input: &[u8], mut encoding_override: EncodingOverride, mut use } Some(pairs.into_iter().map(|(name, value)| ( - encoding_override.decode(&percent_decode(&name)), - encoding_override.decode(&percent_decode(&value)) + encoding_override.decode(&percent_decode(&name).collect::>()), + encoding_override.decode(&percent_decode(&value).collect::>()), )).collect()) } diff --git a/src/host.rs b/src/host.rs index ceee9e71..fce48bd3 100644 --- a/src/host.rs +++ b/src/host.rs @@ -10,7 +10,7 @@ use std::cmp; use std::fmt::{self, Formatter, Write}; use std::net::{Ipv4Addr, Ipv6Addr}; use parser::{ParseResult, ParseError}; -use percent_encoding::percent_decode; +use percent_encoding::lossy_utf8_percent_decode; use idna; #[derive(Copy, Clone, Debug)] @@ -64,8 +64,7 @@ impl Host { } return parse_ipv6addr(&input[1..input.len() - 1]).map(Host::Ipv6) } - let decoded = percent_decode(input.as_bytes()); - let domain = String::from_utf8_lossy(&decoded); + let domain = lossy_utf8_percent_decode(input.as_bytes()); let domain = try!(idna::domain_to_ascii(&domain)); if domain.find(|c| matches!(c, '\0' | '\t' | '\n' | '\r' | ' ' | '#' | '%' | '/' | ':' | '?' 
| '@' | '[' | '\\' | ']' diff --git a/src/lib.rs b/src/lib.rs index c079be5d..4ca8823c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -126,7 +126,7 @@ assert_eq!(css_url.as_str(), "http://servo.github.io/rust-url/main.css") extern crate idna; use host::HostInternal; -use percent_encoding::{PATH_SEGMENT_ENCODE_SET, percent_encode_to}; +use percent_encoding::{PATH_SEGMENT_ENCODE_SET, percent_encode_to, percent_decode}; use std::cmp; use std::fmt; use std::hash; @@ -635,12 +635,10 @@ fn file_url_segments_to_pathbuf(segments: str::Split) -> Result) -> Result) -> Result { - use percent_encoding::percent_decode; - let first = try!(segments.next().ok_or(())); if first.len() != 2 || !first.starts_with(parser::ascii_alpha) || first.as_bytes()[1] != b':' { @@ -669,7 +665,7 @@ fn file_url_segments_to_pathbuf_windows(mut segments: str::Split) -> Resul string.push('\\'); // Currently non-unicode windows paths cannot be represented - match String::from_utf8(percent_decode(segment.as_bytes())) { + match String::from_utf8(percent_decode(segment.as_bytes()).collect()) { Ok(s) => string.push_str(&s), Err(..) => return Err(()), } diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs index 2fbe0ad3..807cf957 100644 --- a/src/percent_encoding.rs +++ b/src/percent_encoding.rs @@ -7,7 +7,9 @@ // except according to those terms. use std::ascii::AsciiExt; +use std::borrow::Cow; use std::fmt::Write; +use std::slice; /// Represents a set of characters / bytes that should be percent-encoded. /// @@ -163,41 +165,53 @@ pub fn utf8_percent_encode(input: &str, encode_set: E) -> String { } -/// Percent-decode the given bytes, and push the result to `output`. 
-pub fn percent_decode_to(input: &[u8], output: &mut Vec) { - let mut i = 0; - while i < input.len() { - let c = input[i]; - if c == b'%' && i + 2 < input.len() { - let h = (input[i + 1] as char).to_digit(16); - let l = (input[i + 2] as char).to_digit(16); - if let (Some(h), Some(l)) = (h, l) { - output.push(h as u8 * 0x10 + l as u8); - i += 3; - continue - } - } - - output.push(c); - i += 1; +/// Percent-decode the given bytes and return an iterator of bytes. +#[inline] +pub fn percent_decode(input: &[u8]) -> PercentDecode { + PercentDecode { + iter: input.iter() } } - -/// Percent-decode the given bytes. -#[inline] -pub fn percent_decode(input: &[u8]) -> Vec { - let mut output = Vec::new(); - percent_decode_to(input, &mut output); - output +pub struct PercentDecode<'a> { + iter: slice::Iter<'a, u8>, } +impl<'a> Iterator for PercentDecode<'a> { + type Item = u8; + + fn next(&mut self) -> Option { + self.iter.next().map(|&byte| { + if byte == b'%' { + let after_percent_sign = self.iter.clone(); + let h = self.iter.next().and_then(|&b| (b as char).to_digit(16)); + let l = self.iter.next().and_then(|&b| (b as char).to_digit(16)); + if let (Some(h), Some(l)) = (h, l) { + return h as u8 * 0x10 + l as u8 + } + self.iter = after_percent_sign; + } + byte + }) + } + + fn size_hint(&self) -> (usize, Option) { + let (low, high) = self.iter.size_hint(); + (low, high.and_then(|high| high.checked_mul(3))) + } +} /// Percent-decode the given bytes, and decode the result as UTF-8. /// /// This is “lossy”: invalid UTF-8 percent-encoded byte sequences /// will be replaced � U+FFFD, the replacement character. 
-#[inline] pub fn lossy_utf8_percent_decode(input: &[u8]) -> String { - String::from_utf8_lossy(&percent_decode(input)).to_string() + let bytes = percent_decode(input).collect::>(); + match String::from_utf8_lossy(&bytes) { + Cow::Owned(s) => return s, + Cow::Borrowed(_) => {} + } + unsafe { + String::from_utf8_unchecked(bytes) + } } From a3210b9b909c9a54be508a4137ac0a8beace0a94 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 9 Feb 2016 18:24:28 +0100 Subject: [PATCH 16/89] Make percent-encoding an iterator. --- src/form_urlencoded.rs | 4 +- src/lib.rs | 7 ++-- src/parser.rs | 18 ++++---- src/percent_encoding.rs | 91 ++++++++++++++++++++++++----------------- 4 files changed, 68 insertions(+), 52 deletions(-) diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index 9e19b692..68f967d6 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -16,7 +16,7 @@ use std::borrow::Borrow; use std::ascii::AsciiExt; use encoding::EncodingOverride; -use percent_encoding::{percent_encode_to, percent_decode, FORM_URLENCODED_ENCODE_SET}; +use percent_encoding::{percent_encode, percent_decode, FORM_URLENCODED_ENCODE_SET}; /// Convert a byte string in the `application/x-www-form-urlencoded` format @@ -125,7 +125,7 @@ where I: IntoIterator, I::Item: Borrow<(K, V)>, K: AsRef, V: AsRef { if byte == b' ' { output.push_str("+") } else { - percent_encode_to(&[byte], FORM_URLENCODED_ENCODE_SET, output) + output.extend(percent_encode(&[byte], FORM_URLENCODED_ENCODE_SET)) } } } diff --git a/src/lib.rs b/src/lib.rs index 4ca8823c..10981bd3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -126,7 +126,7 @@ assert_eq!(css_url.as_str(), "http://servo.github.io/rust-url/main.css") extern crate idna; use host::HostInternal; -use percent_encoding::{PATH_SEGMENT_ENCODE_SET, percent_encode_to, percent_decode}; +use percent_encoding::{PATH_SEGMENT_ENCODE_SET, percent_encode, percent_decode}; use std::cmp; use std::fmt; use std::hash; @@ -585,7 +585,8 @@ fn 
path_to_file_url_segments(path: &Path, serialization: &mut String) -> Result< // skip the root component for component in path.components().skip(1) { serialization.push('/'); - percent_encode_to(component.as_os_str().as_bytes(), PATH_SEGMENT_ENCODE_SET, serialization) + serialization.extend(percent_encode( + component.as_os_str().as_bytes(), PATH_SEGMENT_ENCODE_SET)) } Ok(()) } @@ -624,7 +625,7 @@ fn path_to_file_url_segments_windows(path: &Path, serialization: &mut String) -> // FIXME: somehow work with non-unicode? let component = try!(component.as_os_str().to_str().ok_or(())); serialization.push('/'); - percent_encode_to(component.as_bytes(), PATH_SEGMENT_ENCODE_SET, serialization); + serialization.extend(percent_encode(component.as_bytes(), PATH_SEGMENT_ENCODE_SET)); } Ok(()) } diff --git a/src/parser.rs b/src/parser.rs index 9e035bcd..d7d3ab0d 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -13,7 +13,7 @@ use std::fmt::{self, Formatter, Write}; use super::{Url, EncodingOverride}; use host::{self, HostInternal}; use percent_encoding::{ - utf8_percent_encode_to, percent_encode_to, + utf8_percent_encode, percent_encode, SIMPLE_ENCODE_SET, DEFAULT_ENCODE_SET, USERINFO_ENCODE_SET, QUERY_ENCODE_SET }; @@ -608,7 +608,7 @@ impl<'a> Parser<'a> { _ => { self.check_url_code_point(input, i, c); let utf8_c = &input[i..next_i]; - utf8_percent_encode_to(utf8_c, USERINFO_ENCODE_SET, &mut self.serialization); + self.serialization.extend(utf8_percent_encode(utf8_c, USERINFO_ENCODE_SET)); } } } @@ -798,8 +798,8 @@ impl<'a> Parser<'a> { '\t' | '\n' | '\r' => self.syntax_violation("invalid characters"), _ => { self.check_url_code_point(input, i, c); - utf8_percent_encode_to( - &input[i..next_i], DEFAULT_ENCODE_SET, &mut self.serialization); + self.serialization.extend(utf8_percent_encode( + &input[i..next_i], DEFAULT_ENCODE_SET)); } } } @@ -865,8 +865,8 @@ impl<'a> Parser<'a> { '\t' | '\n' | '\r' => self.syntax_violation("invalid character"), _ => { 
self.check_url_code_point(input, i, c); - utf8_percent_encode_to( - &input[i..next_i], SIMPLE_ENCODE_SET, &mut self.serialization); + self.serialization.extend(utf8_percent_encode( + &input[i..next_i], SIMPLE_ENCODE_SET)); } } } @@ -945,7 +945,7 @@ impl<'a> Parser<'a> { _ => EncodingOverride::utf8(), }; let query_bytes = encoding.encode(&query); - percent_encode_to(&query_bytes, QUERY_ENCODE_SET, &mut self.serialization); + self.serialization.extend(percent_encode(&query_bytes, QUERY_ENCODE_SET)); remaining } @@ -973,8 +973,8 @@ impl<'a> Parser<'a> { '\0' | '\t' | '\n' | '\r' => self.syntax_violation("invalid character"), _ => { self.check_url_code_point(input, i, c); - utf8_percent_encode_to( - &input[i..next_i], SIMPLE_ENCODE_SET, &mut self.serialization); + self.serialization.extend(utf8_percent_encode( + &input[i..next_i], SIMPLE_ENCODE_SET)); } } } diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs index 807cf957..21684d9a 100644 --- a/src/percent_encoding.rs +++ b/src/percent_encoding.rs @@ -8,7 +8,6 @@ use std::ascii::AsciiExt; use std::borrow::Cow; -use std::fmt::Write; use std::slice; /// Represents a set of characters / bytes that should be percent-encoded. @@ -49,7 +48,7 @@ pub trait EncodeSet { /// pub QUERY_ENCODE_SET = [SIMPLE_ENCODE_SET] | {' ', '"', '#', '<', '>'} /// } /// # fn main() { -/// assert_eq!(utf8_percent_encode("foo bar", QUERY_ENCODE_SET), "foo%20bar"); +/// assert_eq!(utf8_percent_encode("foo bar", QUERY_ENCODE_SET).collect::(), "foo%20bar"); /// # } /// ``` #[macro_export] @@ -116,54 +115,70 @@ define_encode_set! { } } -/// Percent-encode the given bytes, and push the result to `output`. -/// -/// The pushed strings are within the ASCII range. +/// Percent-encode the given bytes and return an iterator of `char` in the ASCII range. 
#[inline] -pub fn percent_encode_to(input: &[u8], encode_set: E, output: &mut String) { - for &byte in input { - if encode_set.contains(byte) { - write!(output, "%{:02X}", byte).unwrap(); - } else { - assert!(byte.is_ascii()); - unsafe { - output.as_mut_vec().push(byte) - } - } +pub fn percent_encode(input: &[u8], encode_set: E) -> PercentEncode { + PercentEncode { + iter: input.iter(), + encode_set: encode_set, + state: PercentEncodeState::NextByte, } } - -/// Percent-encode the given bytes. -/// -/// The returned string is within the ASCII range. +/// Percent-encode the UTF-8 encoding of the given string +/// and return an iterator of `char` in the ASCII range. #[inline] -pub fn percent_encode(input: &[u8], encode_set: E) -> String { - let mut output = String::new(); - percent_encode_to(input, encode_set, &mut output); - output +pub fn utf8_percent_encode(input: &str, encode_set: E) -> PercentEncode { + percent_encode(input.as_bytes(), encode_set) } +pub struct PercentEncode<'a, E: EncodeSet> { + iter: slice::Iter<'a, u8>, + encode_set: E, + state: PercentEncodeState, +} -/// Percent-encode the UTF-8 encoding of the given string, and push the result to `output`. -/// -/// The pushed strings are within the ASCII range. -#[inline] -pub fn utf8_percent_encode_to(input: &str, encode_set: E, output: &mut String) { - percent_encode_to(input.as_bytes(), encode_set, output) +enum PercentEncodeState { + NextByte, + HexHigh(u8), + HexLow(u8), } +impl<'a, E: EncodeSet> Iterator for PercentEncode<'a, E> { + type Item = char; -/// Percent-encode the UTF-8 encoding of the given string. -/// -/// The returned string is within the ASCII range. -#[inline] -pub fn utf8_percent_encode(input: &str, encode_set: E) -> String { - let mut output = String::new(); - utf8_percent_encode_to(input, encode_set, &mut output); - output -} + fn next(&mut self) -> Option { + // str::char::from_digit always returns lowercase. 
+ const UPPER_HEX: [char; 16] = ['0', '1', '2', '3', '4', '5', '6', '7', + '8', '9', 'A', 'B', 'C', 'D', 'E', 'F']; + match self.state { + PercentEncodeState::HexHigh(byte) => { + self.state = PercentEncodeState::HexLow(byte); + Some(UPPER_HEX[(byte >> 4) as usize]) + } + PercentEncodeState::HexLow(byte) => { + self.state = PercentEncodeState::NextByte; + Some(UPPER_HEX[(byte & 0x0F) as usize]) + } + PercentEncodeState::NextByte => { + self.iter.next().map(|&byte| { + if self.encode_set.contains(byte) { + self.state = PercentEncodeState::HexHigh(byte); + '%' + } else { + assert!(byte.is_ascii()); + byte as char + } + }) + } + } + } + fn size_hint(&self) -> (usize, Option) { + let (low, high) = self.iter.size_hint(); + (low.saturating_add(2) / 3, high) + } +} /// Percent-decode the given bytes and return an iterator of bytes. #[inline] From 6fadafa1a983da4f90a97f306b7f6f9a23b08bb9 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 9 Feb 2016 19:27:07 +0100 Subject: [PATCH 17/89] Add percent-encoding convenience wrappers. --- src/percent_encoding.rs | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs index 21684d9a..c3ebc34a 100644 --- a/src/percent_encoding.rs +++ b/src/percent_encoding.rs @@ -8,6 +8,7 @@ use std::ascii::AsciiExt; use std::borrow::Cow; +use std::fmt::{self, Write}; use std::slice; /// Represents a set of characters / bytes that should be percent-encoded. @@ -23,7 +24,7 @@ use std::slice; /// /// A few sets are defined in this module. /// Use the [`define_encode_set!`](../macro.define_encode_set!.html) macro to define different ones. -pub trait EncodeSet { +pub trait EncodeSet: Clone { /// Called with UTF-8 bytes rather than code points. /// Should return false for all non-ASCII bytes. 
fn contains(&self, byte: u8) -> bool; @@ -132,12 +133,14 @@ pub fn utf8_percent_encode(input: &str, encode_set: E) -> PercentE percent_encode(input.as_bytes(), encode_set) } +#[derive(Clone)] pub struct PercentEncode<'a, E: EncodeSet> { iter: slice::Iter<'a, u8>, encode_set: E, state: PercentEncodeState, } +#[derive(Clone)] enum PercentEncodeState { NextByte, HexHigh(u8), @@ -180,6 +183,15 @@ impl<'a, E: EncodeSet> Iterator for PercentEncode<'a, E> { } } +impl<'a, E: EncodeSet> fmt::Display for PercentEncode<'a, E> { + fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + for c in (*self).clone() { + try!(formatter.write_char(c)) + } + Ok(()) + } +} + /// Percent-decode the given bytes and return an iterator of bytes. #[inline] pub fn percent_decode(input: &[u8]) -> PercentDecode { @@ -188,6 +200,7 @@ pub fn percent_decode(input: &[u8]) -> PercentDecode { } } +#[derive(Clone)] pub struct PercentDecode<'a> { iter: slice::Iter<'a, u8>, } @@ -216,6 +229,14 @@ impl<'a> Iterator for PercentDecode<'a> { } } +/// Percent-decode the given bytes, and decode the result as UTF-8. +/// +/// This returns `Err` when the percent-decoded bytes are not well-formed in UTF-8. +pub fn utf8_percent_decode(input: &[u8]) -> Result { + let bytes = percent_decode(input).collect::>(); + String::from_utf8(bytes) +} + /// Percent-decode the given bytes, and decode the result as UTF-8. /// /// This is “lossy”: invalid UTF-8 percent-encoded byte sequences From 1c9fb2f104068651c1616c1bf861d82a9fb3e6eb Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 10 Feb 2016 12:59:20 +0100 Subject: [PATCH 18/89] Update tests from https://github.com/w3c/web-platform-tests/blob/master/url/ Parser changes correspond to spec changes. 
--- src/lib.rs | 5 +- src/parser.rs | 20 +- src/webidl.rs | 51 +- tests/urltestdata.json | 4236 ++++++++++++++++++++++++++++++++++++++++ tests/urltestdata.txt | 329 ---- tests/wpt.rs | 226 +-- 6 files changed, 4356 insertions(+), 511 deletions(-) create mode 100644 tests/urltestdata.json delete mode 100644 tests/urltestdata.txt diff --git a/src/lib.rs b/src/lib.rs index 10981bd3..bbbd2cb1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -340,7 +340,10 @@ impl Url { } } - /// Return this URL’s fragment identifier, if any, as a percent-encoded ASCII string. + /// Return this URL’s fragment identifier, if any. + /// + /// **Note:** the parser does *not* percent-encode this component, + /// but the input may be percent-encoded already. pub fn fragment(&self) -> Option<&str> { self.fragment_start.map(|start| { debug_assert!(self.byte_at(start) == b'#'); diff --git a/src/parser.rs b/src/parser.rs index d7d3ab0d..c7fef178 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -798,14 +798,22 @@ impl<'a> Parser<'a> { '\t' | '\n' | '\r' => self.syntax_violation("invalid characters"), _ => { self.check_url_code_point(input, i, c); + if c == '%' { + let after_percent_sign = iter.clone(); + if matches!(iter.next(), Some((_, '2', _))) && + matches!(iter.next(), Some((_, 'E', _)) | Some((_, 'e', _))) { + self.serialization.push('.'); + continue + } + iter = after_percent_sign + } self.serialization.extend(utf8_percent_encode( &input[i..next_i], DEFAULT_ENCODE_SET)); } } } match &self.serialization[segment_start..] { - ".." | ".%2e" | ".%2E" | "%2e." | "%2E." | - "%2e%2e" | "%2E%2e" | "%2e%2E" | "%2E%2E" => { + ".." => { debug_assert!(self.serialization.as_bytes()[segment_start - 1] == b'/'); self.serialization.truncate(segment_start - 1); // Truncate "/.." self.pop_path(scheme_type, path_start); @@ -813,7 +821,7 @@ impl<'a> Parser<'a> { self.serialization.push('/') } }, - "." | "%2e" | "%2E" => { + "." 
=> { self.serialization.truncate(segment_start); }, _ => { @@ -968,13 +976,12 @@ impl<'a> Parser<'a> { } pub fn parse_fragment(&mut self, input: &str) { - for (i, c, next_i) in input.char_ranges() { + for (i, c) in input.char_indices() { match c { '\0' | '\t' | '\n' | '\r' => self.syntax_violation("invalid character"), _ => { self.check_url_code_point(input, i, c); - self.serialization.extend(utf8_percent_encode( - &input[i..next_i], SIMPLE_ENCODE_SET)); + self.serialization.push(c); // No percent-encoding here. } } } @@ -1043,6 +1050,7 @@ impl<'a> StrCharRanges<'a> for &'a str { } } +#[derive(Clone)] pub struct CharRanges<'a> { slice: &'a str, position: usize, diff --git a/src/webidl.rs b/src/webidl.rs index e765e473..9361538d 100644 --- a/src/webidl.rs +++ b/src/webidl.rs @@ -32,13 +32,13 @@ impl WebIdl { } /// **Not implemented yet** Getter for https://url.spec.whatwg.org/#dom-url-origin - pub fn get_origin(_url: &Url) -> String { + pub fn origin(_url: &Url) -> String { unimplemented!() // FIXME } /// Getter for https://url.spec.whatwg.org/#dom-url-protocol #[inline] - pub fn get_protocol(url: &Url) -> &str { + pub fn protocol(url: &Url) -> &str { debug_assert!(url.byte_at(url.scheme_end) == b':'); url.slice(..url.scheme_end + 1) } @@ -50,7 +50,7 @@ impl WebIdl { /// Getter for https://url.spec.whatwg.org/#dom-url-username #[inline] - pub fn get_username(url: &Url) -> &str { + pub fn username(url: &Url) -> &str { url.username() } @@ -61,7 +61,7 @@ impl WebIdl { /// Getter for https://url.spec.whatwg.org/#dom-url-password #[inline] - pub fn get_password(url: &Url) -> &str { + pub fn password(url: &Url) -> &str { url.password().unwrap_or("") } @@ -72,9 +72,8 @@ impl WebIdl { /// Getter for https://url.spec.whatwg.org/#dom-url-host #[inline] - pub fn get_host(url: &Url) -> &str { - let host = url.slice(url.host_start..url.host_end); - debug_assert!(!host.is_empty() || url.non_relative); + pub fn host(url: &Url) -> &str { + let host = 
url.slice(url.host_start..url.path_start); host } @@ -85,7 +84,7 @@ impl WebIdl { /// Getter for https://url.spec.whatwg.org/#dom-url-hostname #[inline] - pub fn get_hostname(url: &Url) -> &str { + pub fn hostname(url: &Url) -> &str { url.host_str().unwrap_or("") } @@ -96,7 +95,7 @@ impl WebIdl { /// Getter for https://url.spec.whatwg.org/#dom-url-port #[inline] - pub fn get_port(url: &Url) -> &str { + pub fn port(url: &Url) -> &str { if url.port.is_some() { debug_assert!(url.byte_at(url.host_end) == b':'); url.slice(url.host_end + 1..url.path_start) @@ -112,7 +111,7 @@ impl WebIdl { /// Getter for https://url.spec.whatwg.org/#dom-url-pathname #[inline] - pub fn get_pathname(url: &Url) -> &str { + pub fn pathname(url: &Url) -> &str { url.path() } @@ -122,13 +121,21 @@ impl WebIdl { } /// Getter for https://url.spec.whatwg.org/#dom-url-search - pub fn get_search(url: &Url) -> &str { + pub fn search(url: &Url) -> &str { match (url.query_start, url.fragment_start) { - (None, _) => "", - (Some(query_start), None) => url.slice(query_start..), - (Some(query_start), Some(fragment_start)) => { - url.slice(query_start..fragment_start) - } + (Some(query_start), None) if { + debug_assert!(url.byte_at(query_start) == b'?'); + // If the query (after ?) is not empty + (query_start as usize) < url.serialization.len() - 1 + } => url.slice(query_start..), + + (Some(query_start), Some(fragment_start)) if { + debug_assert!(url.byte_at(query_start) == b'?'); + // If the query (after ?) 
is not empty + query_start < fragment_start + } => url.slice(query_start..fragment_start), + + _ => "", } } @@ -138,15 +145,19 @@ impl WebIdl { } /// **Not implemented yet** Getter for https://url.spec.whatwg.org/#dom-url-searchparams - pub fn get_search_params(_url: &Url) -> Vec<(String, String)> { + pub fn search_params(_url: &Url) -> Vec<(String, String)> { unimplemented!(); // FIXME } /// Getter for https://url.spec.whatwg.org/#dom-url-hash - pub fn get_hash(url: &Url) -> &str { + pub fn hash(url: &Url) -> &str { match url.fragment_start { - Some(start) => url.slice(start..), - None => "", + Some(start) if { + debug_assert!(url.byte_at(start) == b'#'); + // If the fragment (after #) is not empty + (start as usize) < url.serialization.len() - 1 + } => url.slice(start..), + _ => "", } } diff --git a/tests/urltestdata.json b/tests/urltestdata.json new file mode 100644 index 00000000..4ea27d73 --- /dev/null +++ b/tests/urltestdata.json @@ -0,0 +1,4236 @@ +[ + "# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/script-tests/segments.js", + { + "input": "http://example\t.\norg", + "base": "http://example.org/foo/bar", + "href": "http://example.org/", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://user:pass@foo:21/bar;par?b#c", + "base": "http://example.org/foo/bar", + "href": "http://user:pass@foo:21/bar;par?b#c", + "origin": "http://foo:21", + "protocol": "http:", + "username": "user", + "password": "pass", + "host": "foo:21", + "hostname": "foo", + "port": "21", + "pathname": "/bar;par", + "search": "?b", + "hash": "#c" + }, + { + "input": "http:foo.com", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/foo.com", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + 
"hostname": "example.org", + "port": "", + "pathname": "/foo/foo.com", + "search": "", + "hash": "" + }, + { + "input": "\t :foo.com \n", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/:foo.com", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:foo.com", + "search": "", + "hash": "" + }, + { + "input": " foo.com ", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/foo.com", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/foo.com", + "search": "", + "hash": "" + }, + { + "input": "a:\t foo.com", + "base": "http://example.org/foo/bar", + "href": "a: foo.com", + "origin": "null", + "protocol": "a:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": " foo.com", + "search": "", + "hash": "" + }, + { + "input": "http://f:21/ b ? 
d # e ", + "base": "http://example.org/foo/bar", + "href": "http://f:21/%20b%20?%20d%20# e", + "origin": "http://f:21", + "protocol": "http:", + "username": "", + "password": "", + "host": "f:21", + "hostname": "f", + "port": "21", + "pathname": "/%20b%20", + "search": "?%20d%20", + "hash": "# e" + }, + { + "input": "http://f:/c", + "base": "http://example.org/foo/bar", + "href": "http://f/c", + "origin": "http://f", + "protocol": "http:", + "username": "", + "password": "", + "host": "f", + "hostname": "f", + "port": "", + "pathname": "/c", + "search": "", + "hash": "" + }, + { + "input": "http://f:0/c", + "base": "http://example.org/foo/bar", + "href": "http://f:0/c", + "origin": "http://f:0", + "protocol": "http:", + "username": "", + "password": "", + "host": "f:0", + "hostname": "f", + "port": "0", + "pathname": "/c", + "search": "", + "hash": "" + }, + { + "input": "http://f:00000000000000/c", + "base": "http://example.org/foo/bar", + "href": "http://f:0/c", + "origin": "http://f:0", + "protocol": "http:", + "username": "", + "password": "", + "host": "f:0", + "hostname": "f", + "port": "0", + "pathname": "/c", + "search": "", + "hash": "" + }, + { + "input": "http://f:00000000000000000000080/c", + "base": "http://example.org/foo/bar", + "href": "http://f/c", + "origin": "http://f", + "protocol": "http:", + "username": "", + "password": "", + "host": "f", + "hostname": "f", + "port": "", + "pathname": "/c", + "search": "", + "hash": "" + }, + { + "input": "http://f:b/c", + "base": "http://example.org/foo/bar", + "failure": true + }, + { + "input": "http://f: /c", + "base": "http://example.org/foo/bar", + "failure": true + }, + { + "input": "http://f:\n/c", + "base": "http://example.org/foo/bar", + "href": "http://f/c", + "origin": "http://f", + "protocol": "http:", + "username": "", + "password": "", + "host": "f", + "hostname": "f", + "port": "", + "pathname": "/c", + "search": "", + "hash": "" + }, + { + "input": "http://f:fifty-two/c", + "base": 
"http://example.org/foo/bar", + "failure": true + }, + { + "input": "http://f:999999/c", + "base": "http://example.org/foo/bar", + "failure": true + }, + { + "input": "http://f: 21 / b ? d # e ", + "base": "http://example.org/foo/bar", + "failure": true + }, + { + "input": "", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/bar", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/bar", + "search": "", + "hash": "" + }, + { + "input": " \t", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/bar", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/bar", + "search": "", + "hash": "" + }, + { + "input": ":foo.com/", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/:foo.com/", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:foo.com/", + "search": "", + "hash": "" + }, + { + "input": ":foo.com\\", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/:foo.com/", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:foo.com/", + "search": "", + "hash": "" + }, + { + "input": ":", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/:", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:", + "search": "", + "hash": "" + }, + { + "input": ":a", + "base": "http://example.org/foo/bar", + "href": 
"http://example.org/foo/:a", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:a", + "search": "", + "hash": "" + }, + { + "input": ":/", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/:/", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:/", + "search": "", + "hash": "" + }, + { + "input": ":\\", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/:/", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:/", + "search": "", + "hash": "" + }, + { + "input": ":#", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/:#", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:", + "search": "", + "hash": "" + }, + { + "input": "#", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/bar#", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/bar", + "search": "", + "hash": "" + }, + { + "input": "#/", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/bar#/", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/bar", + "search": "", + "hash": "#/" + }, + { + "input": "#\\", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/bar#\\", + "origin": "http://example.org", + 
"protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/bar", + "search": "", + "hash": "#\\" + }, + { + "input": "#;?", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/bar#;?", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/bar", + "search": "", + "hash": "#;?" + }, + { + "input": "?", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/bar?", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/bar", + "search": "", + "hash": "" + }, + { + "input": "/", + "base": "http://example.org/foo/bar", + "href": "http://example.org/", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": ":23", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/:23", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:23", + "search": "", + "hash": "" + }, + { + "input": "/:23", + "base": "http://example.org/foo/bar", + "href": "http://example.org/:23", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/:23", + "search": "", + "hash": "" + }, + { + "input": "::", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/::", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": 
"example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/::", + "search": "", + "hash": "" + }, + { + "input": "::23", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/::23", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/::23", + "search": "", + "hash": "" + }, + { + "input": "foo://", + "base": "http://example.org/foo/bar", + "href": "foo:///", + "origin": "null", + "protocol": "foo:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://a:b@c:29/d", + "base": "http://example.org/foo/bar", + "href": "http://a:b@c:29/d", + "origin": "http://c:29", + "protocol": "http:", + "username": "a", + "password": "b", + "host": "c:29", + "hostname": "c", + "port": "29", + "pathname": "/d", + "search": "", + "hash": "" + }, + { + "input": "http::@c:29", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/:@c:29", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:@c:29", + "search": "", + "hash": "" + }, + { + "input": "http://&a:foo(b]c@d:2/", + "base": "http://example.org/foo/bar", + "href": "http://&a:foo(b%5Dc@d:2/", + "origin": "http://d:2", + "protocol": "http:", + "username": "&a", + "password": "foo(b%5Dc", + "host": "d:2", + "hostname": "d", + "port": "2", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://::@c@d:2", + "base": "http://example.org/foo/bar", + "href": "http://:%3A%40c@d:2/", + "origin": "http://d:2", + "protocol": "http:", + "username": "", + "password": "%3A%40c", + "host": "d:2", + "hostname": "d", + "port": "2", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": 
"http://foo.com:b@d/", + "base": "http://example.org/foo/bar", + "href": "http://foo.com:b@d/", + "origin": "http://d", + "protocol": "http:", + "username": "foo.com", + "password": "b", + "host": "d", + "hostname": "d", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://foo.com/\\@", + "base": "http://example.org/foo/bar", + "href": "http://foo.com//@", + "origin": "http://foo.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo.com", + "hostname": "foo.com", + "port": "", + "pathname": "//@", + "search": "", + "hash": "" + }, + { + "input": "http:\\\\foo.com\\", + "base": "http://example.org/foo/bar", + "href": "http://foo.com/", + "origin": "http://foo.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo.com", + "hostname": "foo.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:\\\\a\\b:c\\d@foo.com\\", + "base": "http://example.org/foo/bar", + "href": "http://a/b:c/d@foo.com/", + "origin": "http://a", + "protocol": "http:", + "username": "", + "password": "", + "host": "a", + "hostname": "a", + "port": "", + "pathname": "/b:c/d@foo.com/", + "search": "", + "hash": "" + }, + { + "input": "foo:/", + "base": "http://example.org/foo/bar", + "href": "foo:/", + "origin": "null", + "protocol": "foo:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "foo:/bar.com/", + "base": "http://example.org/foo/bar", + "href": "foo:/bar.com/", + "origin": "null", + "protocol": "foo:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/bar.com/", + "search": "", + "hash": "" + }, + { + "input": "foo://///////", + "base": "http://example.org/foo/bar", + "href": "foo://///////", + "origin": "null", + "protocol": "foo:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + 
"pathname": "///////", + "search": "", + "hash": "" + }, + { + "input": "foo://///////bar.com/", + "base": "http://example.org/foo/bar", + "href": "foo://///////bar.com/", + "origin": "null", + "protocol": "foo:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "///////bar.com/", + "search": "", + "hash": "" + }, + { + "input": "foo:////://///", + "base": "http://example.org/foo/bar", + "href": "foo:////://///", + "origin": "null", + "protocol": "foo:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "//://///", + "search": "", + "hash": "" + }, + { + "input": "c:/foo", + "base": "http://example.org/foo/bar", + "href": "c:/foo", + "origin": "null", + "protocol": "c:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/foo", + "search": "", + "hash": "" + }, + { + "input": "//foo/bar", + "base": "http://example.org/foo/bar", + "href": "http://foo/bar", + "origin": "http://foo", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/bar", + "search": "", + "hash": "" + }, + { + "input": "http://foo/path;a??e#f#g", + "base": "http://example.org/foo/bar", + "href": "http://foo/path;a??e#f#g", + "origin": "http://foo", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/path;a", + "search": "??e", + "hash": "#f#g" + }, + { + "input": "http://foo/abcd?efgh?ijkl", + "base": "http://example.org/foo/bar", + "href": "http://foo/abcd?efgh?ijkl", + "origin": "http://foo", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/abcd", + "search": "?efgh?ijkl", + "hash": "" + }, + { + "input": "http://foo/abcd#foo?bar", + "base": "http://example.org/foo/bar", + "href": "http://foo/abcd#foo?bar", + "origin": "http://foo", + 
"protocol": "http:", + "username": "", + "password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/abcd", + "search": "", + "hash": "#foo?bar" + }, + { + "input": "[61:24:74]:98", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/[61:24:74]:98", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/[61:24:74]:98", + "search": "", + "hash": "" + }, + { + "input": "http:[61:27]/:foo", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/[61:27]/:foo", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/[61:27]/:foo", + "search": "", + "hash": "" + }, + { + "input": "http://[1::2]:3:4", + "base": "http://example.org/foo/bar", + "failure": true + }, + { + "input": "http://2001::1", + "base": "http://example.org/foo/bar", + "failure": true + }, + { + "input": "http://2001::1]", + "base": "http://example.org/foo/bar", + "failure": true + }, + { + "input": "http://2001::1]:80", + "base": "http://example.org/foo/bar", + "failure": true + }, + { + "input": "http://[2001::1]", + "base": "http://example.org/foo/bar", + "href": "http://[2001::1]/", + "origin": "http://[2001::1]", + "protocol": "http:", + "username": "", + "password": "", + "host": "[2001::1]", + "hostname": "[2001::1]", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://[2001::1]:80", + "base": "http://example.org/foo/bar", + "href": "http://[2001::1]/", + "origin": "http://[2001::1]", + "protocol": "http:", + "username": "", + "password": "", + "host": "[2001::1]", + "hostname": "[2001::1]", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:/example.com/", + "base": "http://example.org/foo/bar", + "href": 
"http://example.org/example.com/", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "ftp:/example.com/", + "base": "http://example.org/foo/bar", + "href": "ftp://example.com/", + "origin": "ftp://example.com", + "protocol": "ftp:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "https:/example.com/", + "base": "http://example.org/foo/bar", + "href": "https://example.com/", + "origin": "https://example.com", + "protocol": "https:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "madeupscheme:/example.com/", + "base": "http://example.org/foo/bar", + "href": "madeupscheme:/example.com/", + "origin": "null", + "protocol": "madeupscheme:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "file:/example.com/", + "base": "http://example.org/foo/bar", + "href": "file:///example.com/", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "ftps:/example.com/", + "base": "http://example.org/foo/bar", + "href": "ftps:/example.com/", + "origin": "null", + "protocol": "ftps:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "gopher:/example.com/", + "base": "http://example.org/foo/bar", + "href": "gopher://example.com/", + "origin": "gopher://example.com", + "protocol": 
"gopher:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ws:/example.com/", + "base": "http://example.org/foo/bar", + "href": "ws://example.com/", + "origin": "ws://example.com", + "protocol": "ws:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "wss:/example.com/", + "base": "http://example.org/foo/bar", + "href": "wss://example.com/", + "origin": "wss://example.com", + "protocol": "wss:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "data:/example.com/", + "base": "http://example.org/foo/bar", + "href": "data:/example.com/", + "origin": "null", + "protocol": "data:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "javascript:/example.com/", + "base": "http://example.org/foo/bar", + "href": "javascript:/example.com/", + "origin": "null", + "protocol": "javascript:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "mailto:/example.com/", + "base": "http://example.org/foo/bar", + "href": "mailto:/example.com/", + "origin": "null", + "protocol": "mailto:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "http:example.com/", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/example.com/", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + 
"port": "", + "pathname": "/foo/example.com/", + "search": "", + "hash": "" + }, + { + "input": "ftp:example.com/", + "base": "http://example.org/foo/bar", + "href": "ftp://example.com/", + "origin": "ftp://example.com", + "protocol": "ftp:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "https:example.com/", + "base": "http://example.org/foo/bar", + "href": "https://example.com/", + "origin": "https://example.com", + "protocol": "https:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "madeupscheme:example.com/", + "base": "http://example.org/foo/bar", + "href": "madeupscheme:example.com/", + "origin": "null", + "protocol": "madeupscheme:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + { + "input": "ftps:example.com/", + "base": "http://example.org/foo/bar", + "href": "ftps:example.com/", + "origin": "null", + "protocol": "ftps:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + { + "input": "gopher:example.com/", + "base": "http://example.org/foo/bar", + "href": "gopher://example.com/", + "origin": "gopher://example.com", + "protocol": "gopher:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ws:example.com/", + "base": "http://example.org/foo/bar", + "href": "ws://example.com/", + "origin": "ws://example.com", + "protocol": "ws:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + 
"input": "wss:example.com/", + "base": "http://example.org/foo/bar", + "href": "wss://example.com/", + "origin": "wss://example.com", + "protocol": "wss:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "data:example.com/", + "base": "http://example.org/foo/bar", + "href": "data:example.com/", + "origin": "null", + "protocol": "data:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + { + "input": "javascript:example.com/", + "base": "http://example.org/foo/bar", + "href": "javascript:example.com/", + "origin": "null", + "protocol": "javascript:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + { + "input": "mailto:example.com/", + "base": "http://example.org/foo/bar", + "href": "mailto:example.com/", + "origin": "null", + "protocol": "mailto:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + { + "input": "/a/b/c", + "base": "http://example.org/foo/bar", + "href": "http://example.org/a/b/c", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/a/b/c", + "search": "", + "hash": "" + }, + { + "input": "/a/ /c", + "base": "http://example.org/foo/bar", + "href": "http://example.org/a/%20/c", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/a/%20/c", + "search": "", + "hash": "" + }, + { + "input": "/a%2fc", + "base": "http://example.org/foo/bar", + "href": "http://example.org/a%2fc", + "origin": 
"http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/a%2fc", + "search": "", + "hash": "" + }, + { + "input": "/a/%2f/c", + "base": "http://example.org/foo/bar", + "href": "http://example.org/a/%2f/c", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/a/%2f/c", + "search": "", + "hash": "" + }, + { + "input": "#β", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/bar#β", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/bar", + "search": "", + "hash": "#β" + }, + { + "input": "data:text/html,test#test", + "base": "http://example.org/foo/bar", + "href": "data:text/html,test#test", + "origin": "null", + "protocol": "data:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "text/html,test", + "search": "", + "hash": "#test" + }, + "# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/file.html", + { + "input": "file:c:\\foo\\bar.html", + "base": "file:///tmp/mock/path", + "href": "file:///c:/foo/bar.html", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/c:/foo/bar.html", + "search": "", + "hash": "" + }, + { + "input": " File:c|////foo\\bar.html", + "base": "file:///tmp/mock/path", + "href": "file:///c:////foo/bar.html", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/c:////foo/bar.html", + "search": "", + "hash": "" + }, + { + "input": "C|/foo/bar", + "base": "file:///tmp/mock/path", + "href": "file:///C:/foo/bar", + "origin": 
"file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/C:/foo/bar", + "search": "", + "hash": "" + }, + { + "input": "/C|\\foo\\bar", + "base": "file:///tmp/mock/path", + "href": "file:///C:/foo/bar", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/C:/foo/bar", + "search": "", + "hash": "" + }, + { + "input": "//C|/foo/bar", + "base": "file:///tmp/mock/path", + "href": "file:///C:/foo/bar", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/C:/foo/bar", + "search": "", + "hash": "" + }, + { + "input": "//server/file", + "base": "file:///tmp/mock/path", + "href": "file://server/file", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "server", + "hostname": "server", + "port": "", + "pathname": "/file", + "search": "", + "hash": "" + }, + { + "input": "\\\\server\\file", + "base": "file:///tmp/mock/path", + "href": "file://server/file", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "server", + "hostname": "server", + "port": "", + "pathname": "/file", + "search": "", + "hash": "" + }, + { + "input": "/\\server/file", + "base": "file:///tmp/mock/path", + "href": "file://server/file", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "server", + "hostname": "server", + "port": "", + "pathname": "/file", + "search": "", + "hash": "" + }, + { + "input": "file:///foo/bar.txt", + "base": "file:///tmp/mock/path", + "href": "file:///foo/bar.txt", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/foo/bar.txt", + "search": "", + "hash": "" + }, + { + "input": "file:///home/me", + "base": 
"file:///tmp/mock/path", + "href": "file:///home/me", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/home/me", + "search": "", + "hash": "" + }, + { + "input": "//", + "base": "file:///tmp/mock/path", + "href": "file:///", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "///", + "base": "file:///tmp/mock/path", + "href": "file:///", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "///test", + "base": "file:///tmp/mock/path", + "href": "file:///test", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/test", + "search": "", + "hash": "" + }, + { + "input": "file://test", + "base": "file:///tmp/mock/path", + "href": "file://test/", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "test", + "hostname": "test", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "file://localhost", + "base": "file:///tmp/mock/path", + "href": "file:///", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "file://localhost/", + "base": "file:///tmp/mock/path", + "href": "file:///", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "file://localhost/test", + "base": "file:///tmp/mock/path", + "href": "file:///test", + "origin": "file://", + "protocol": 
"file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/test", + "search": "", + "hash": "" + }, + { + "input": "test", + "base": "file:///tmp/mock/path", + "href": "file:///tmp/mock/test", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/tmp/mock/test", + "search": "", + "hash": "" + }, + { + "input": "file:test", + "base": "file:///tmp/mock/path", + "href": "file:///tmp/mock/test", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/tmp/mock/test", + "search": "", + "hash": "" + }, + "# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/script-tests/path.js", + { + "input": "http://example.com/././foo", + "base": "about:blank", + "href": "http://example.com/foo", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/./.foo", + "base": "about:blank", + "href": "http://example.com/.foo", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/.foo", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/.", + "base": "about:blank", + "href": "http://example.com/foo/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo/", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/./", + "base": "about:blank", + "href": "http://example.com/foo/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + 
"host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo/", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/bar/..", + "base": "about:blank", + "href": "http://example.com/foo/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo/", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/bar/../", + "base": "about:blank", + "href": "http://example.com/foo/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo/", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/..bar", + "base": "about:blank", + "href": "http://example.com/foo/..bar", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo/..bar", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/bar/../ton", + "base": "about:blank", + "href": "http://example.com/foo/ton", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo/ton", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/bar/../ton/../../a", + "base": "about:blank", + "href": "http://example.com/a", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/a", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/../../..", + "base": "about:blank", + "href": "http://example.com/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": 
"example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/../../../ton", + "base": "about:blank", + "href": "http://example.com/ton", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/ton", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/%2e", + "base": "about:blank", + "href": "http://example.com/foo/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo/", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/%2e%2", + "base": "about:blank", + "href": "http://example.com/foo/.%2", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo/.%2", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/%2e./%2e%2e/.%2e/%2e.bar", + "base": "about:blank", + "href": "http://example.com/..bar", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/..bar", + "search": "", + "hash": "" + }, + { + "input": "http://example.com////../..", + "base": "about:blank", + "href": "http://example.com//", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "//", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/bar//../..", + "base": "about:blank", + "href": "http://example.com/foo/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": 
"example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo/", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/bar//..", + "base": "about:blank", + "href": "http://example.com/foo/bar/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo/bar/", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo", + "base": "about:blank", + "href": "http://example.com/foo", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/%20foo", + "base": "about:blank", + "href": "http://example.com/%20foo", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/%20foo", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo%", + "base": "about:blank", + "href": "http://example.com/foo%", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo%", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo%2", + "base": "about:blank", + "href": "http://example.com/foo%2", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo%2", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo%2zbar", + "base": "about:blank", + "href": "http://example.com/foo%2zbar", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": 
"example.com", + "port": "", + "pathname": "/foo%2zbar", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo%2©zbar", + "base": "about:blank", + "href": "http://example.com/foo%2%C3%82%C2%A9zbar", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo%2%C3%82%C2%A9zbar", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo%41%7a", + "base": "about:blank", + "href": "http://example.com/foo%41%7a", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo%41%7a", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo\t\u0091%91", + "base": "about:blank", + "href": "http://example.com/foo%C2%91%91", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo%C2%91%91", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo%00%51", + "base": "about:blank", + "href": "http://example.com/foo%00%51", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo%00%51", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/(%28:%3A%29)", + "base": "about:blank", + "href": "http://example.com/(%28:%3A%29)", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/(%28:%3A%29)", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/%3A%3a%3C%3c", + "base": "about:blank", + "href": "http://example.com/%3A%3a%3C%3c", + "origin": "http://example.com", + "protocol": "http:", + 
"username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/%3A%3a%3C%3c", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo\tbar", + "base": "about:blank", + "href": "http://example.com/foobar", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foobar", + "search": "", + "hash": "" + }, + { + "input": "http://example.com\\\\foo\\\\bar", + "base": "about:blank", + "href": "http://example.com//foo//bar", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "//foo//bar", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/%7Ffp3%3Eju%3Dduvgw%3Dd", + "base": "about:blank", + "href": "http://example.com/%7Ffp3%3Eju%3Dduvgw%3Dd", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/%7Ffp3%3Eju%3Dduvgw%3Dd", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/@asdf%40", + "base": "about:blank", + "href": "http://example.com/@asdf%40", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/@asdf%40", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/你好你好", + "base": "about:blank", + "href": "http://example.com/%E4%BD%A0%E5%A5%BD%E4%BD%A0%E5%A5%BD", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/%E4%BD%A0%E5%A5%BD%E4%BD%A0%E5%A5%BD", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/‥/foo", + "base": 
"about:blank", + "href": "http://example.com/%E2%80%A5/foo", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/%E2%80%A5/foo", + "search": "", + "hash": "" + }, + { + "input": "http://example.com//foo", + "base": "about:blank", + "href": "http://example.com/%EF%BB%BF/foo", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/%EF%BB%BF/foo", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/‮/foo/‭/bar", + "base": "about:blank", + "href": "http://example.com/%E2%80%AE/foo/%E2%80%AD/bar", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/%E2%80%AE/foo/%E2%80%AD/bar", + "search": "", + "hash": "" + }, + "# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/script-tests/relative.js", + { + "input": "http://www.google.com/foo?bar=baz#", + "base": "about:blank", + "href": "http://www.google.com/foo?bar=baz#", + "origin": "http://www.google.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.google.com", + "hostname": "www.google.com", + "port": "", + "pathname": "/foo", + "search": "?bar=baz", + "hash": "" + }, + { + "input": "http://www.google.com/foo?bar=baz# »", + "base": "about:blank", + "href": "http://www.google.com/foo?bar=baz# »", + "origin": "http://www.google.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.google.com", + "hostname": "www.google.com", + "port": "", + "pathname": "/foo", + "search": "?bar=baz", + "hash": "# »" + }, + { + "input": "data:test# »", + "base": "about:blank", + "href": "data:test# »", + "origin": "null", + "protocol": "data:", + "username": "", + "password": "", + "host": 
"", + "hostname": "", + "port": "", + "pathname": "test", + "search": "", + "hash": "# »" + }, + { + "input": "http://[www.google.com]/", + "base": "about:blank", + "failure": true + }, + { + "input": "http://www.google.com", + "base": "about:blank", + "href": "http://www.google.com/", + "origin": "http://www.google.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.google.com", + "hostname": "www.google.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://192.0x00A80001", + "base": "about:blank", + "href": "http://192.168.0.1/", + "origin": "http://192.168.0.1", + "protocol": "http:", + "username": "", + "password": "", + "host": "192.168.0.1", + "hostname": "192.168.0.1", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://www/foo%2Ehtml", + "base": "about:blank", + "href": "http://www/foo.html", + "origin": "http://www", + "protocol": "http:", + "username": "", + "password": "", + "host": "www", + "hostname": "www", + "port": "", + "pathname": "/foo.html", + "search": "", + "hash": "" + }, + { + "input": "http://www/foo/%2E/html", + "base": "about:blank", + "href": "http://www/foo/html", + "origin": "http://www", + "protocol": "http:", + "username": "", + "password": "", + "host": "www", + "hostname": "www", + "port": "", + "pathname": "/foo/html", + "search": "", + "hash": "" + }, + { + "input": "http://user:pass@/", + "base": "about:blank", + "failure": true + }, + { + "input": "http://%25DOMAIN:foobar@foodomain.com/", + "base": "about:blank", + "href": "http://%25DOMAIN:foobar@foodomain.com/", + "origin": "http://foodomain.com", + "protocol": "http:", + "username": "%25DOMAIN", + "password": "foobar", + "host": "foodomain.com", + "hostname": "foodomain.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:\\\\www.google.com\\foo", + "base": "about:blank", + "href": "http://www.google.com/foo", + "origin": 
"http://www.google.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.google.com", + "hostname": "www.google.com", + "port": "", + "pathname": "/foo", + "search": "", + "hash": "" + }, + { + "input": "http://foo:80/", + "base": "about:blank", + "href": "http://foo/", + "origin": "http://foo", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://foo:81/", + "base": "about:blank", + "href": "http://foo:81/", + "origin": "http://foo:81", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo:81", + "hostname": "foo", + "port": "81", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "httpa://foo:80/", + "base": "about:blank", + "href": "httpa://foo:80/", + "origin": "null", + "protocol": "httpa:", + "username": "", + "password": "", + "host": "foo:80", + "hostname": "foo", + "port": "80", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://foo:-80/", + "base": "about:blank", + "failure": true + }, + { + "input": "https://foo:443/", + "base": "about:blank", + "href": "https://foo/", + "origin": "https://foo", + "protocol": "https:", + "username": "", + "password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "https://foo:80/", + "base": "about:blank", + "href": "https://foo:80/", + "origin": "https://foo:80", + "protocol": "https:", + "username": "", + "password": "", + "host": "foo:80", + "hostname": "foo", + "port": "80", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ftp://foo:21/", + "base": "about:blank", + "href": "ftp://foo/", + "origin": "ftp://foo", + "protocol": "ftp:", + "username": "", + "password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ftp://foo:80/", + 
"base": "about:blank", + "href": "ftp://foo:80/", + "origin": "ftp://foo:80", + "protocol": "ftp:", + "username": "", + "password": "", + "host": "foo:80", + "hostname": "foo", + "port": "80", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "gopher://foo:70/", + "base": "about:blank", + "href": "gopher://foo/", + "origin": "gopher://foo", + "protocol": "gopher:", + "username": "", + "password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "gopher://foo:443/", + "base": "about:blank", + "href": "gopher://foo:443/", + "origin": "gopher://foo:443", + "protocol": "gopher:", + "username": "", + "password": "", + "host": "foo:443", + "hostname": "foo", + "port": "443", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ws://foo:80/", + "base": "about:blank", + "href": "ws://foo/", + "origin": "ws://foo", + "protocol": "ws:", + "username": "", + "password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ws://foo:81/", + "base": "about:blank", + "href": "ws://foo:81/", + "origin": "ws://foo:81", + "protocol": "ws:", + "username": "", + "password": "", + "host": "foo:81", + "hostname": "foo", + "port": "81", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ws://foo:443/", + "base": "about:blank", + "href": "ws://foo:443/", + "origin": "ws://foo:443", + "protocol": "ws:", + "username": "", + "password": "", + "host": "foo:443", + "hostname": "foo", + "port": "443", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ws://foo:815/", + "base": "about:blank", + "href": "ws://foo:815/", + "origin": "ws://foo:815", + "protocol": "ws:", + "username": "", + "password": "", + "host": "foo:815", + "hostname": "foo", + "port": "815", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "wss://foo:80/", + "base": "about:blank", + "href": 
"wss://foo:80/", + "origin": "wss://foo:80", + "protocol": "wss:", + "username": "", + "password": "", + "host": "foo:80", + "hostname": "foo", + "port": "80", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "wss://foo:81/", + "base": "about:blank", + "href": "wss://foo:81/", + "origin": "wss://foo:81", + "protocol": "wss:", + "username": "", + "password": "", + "host": "foo:81", + "hostname": "foo", + "port": "81", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "wss://foo:443/", + "base": "about:blank", + "href": "wss://foo/", + "origin": "wss://foo", + "protocol": "wss:", + "username": "", + "password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "wss://foo:815/", + "base": "about:blank", + "href": "wss://foo:815/", + "origin": "wss://foo:815", + "protocol": "wss:", + "username": "", + "password": "", + "host": "foo:815", + "hostname": "foo", + "port": "815", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:/example.com/", + "base": "about:blank", + "href": "http://example.com/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ftp:/example.com/", + "base": "about:blank", + "href": "ftp://example.com/", + "origin": "ftp://example.com", + "protocol": "ftp:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "https:/example.com/", + "base": "about:blank", + "href": "https://example.com/", + "origin": "https://example.com", + "protocol": "https:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": 
"madeupscheme:/example.com/", + "base": "about:blank", + "href": "madeupscheme:/example.com/", + "origin": "null", + "protocol": "madeupscheme:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "file:/example.com/", + "base": "about:blank", + "href": "file:///example.com/", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "ftps:/example.com/", + "base": "about:blank", + "href": "ftps:/example.com/", + "origin": "null", + "protocol": "ftps:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "gopher:/example.com/", + "base": "about:blank", + "href": "gopher://example.com/", + "origin": "gopher://example.com", + "protocol": "gopher:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ws:/example.com/", + "base": "about:blank", + "href": "ws://example.com/", + "origin": "ws://example.com", + "protocol": "ws:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "wss:/example.com/", + "base": "about:blank", + "href": "wss://example.com/", + "origin": "wss://example.com", + "protocol": "wss:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "data:/example.com/", + "base": "about:blank", + "href": "data:/example.com/", + "origin": "null", + "protocol": "data:", + "username": "", + "password": "", + "host": "", + "hostname": "", + 
"port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "javascript:/example.com/", + "base": "about:blank", + "href": "javascript:/example.com/", + "origin": "null", + "protocol": "javascript:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "mailto:/example.com/", + "base": "about:blank", + "href": "mailto:/example.com/", + "origin": "null", + "protocol": "mailto:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "http:example.com/", + "base": "about:blank", + "href": "http://example.com/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ftp:example.com/", + "base": "about:blank", + "href": "ftp://example.com/", + "origin": "ftp://example.com", + "protocol": "ftp:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "https:example.com/", + "base": "about:blank", + "href": "https://example.com/", + "origin": "https://example.com", + "protocol": "https:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "madeupscheme:example.com/", + "base": "about:blank", + "href": "madeupscheme:example.com/", + "origin": "null", + "protocol": "madeupscheme:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + { + "input": "ftps:example.com/", + "base": "about:blank", + "href": "ftps:example.com/", + 
"origin": "null", + "protocol": "ftps:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + { + "input": "gopher:example.com/", + "base": "about:blank", + "href": "gopher://example.com/", + "origin": "gopher://example.com", + "protocol": "gopher:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ws:example.com/", + "base": "about:blank", + "href": "ws://example.com/", + "origin": "ws://example.com", + "protocol": "ws:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "wss:example.com/", + "base": "about:blank", + "href": "wss://example.com/", + "origin": "wss://example.com", + "protocol": "wss:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "data:example.com/", + "base": "about:blank", + "href": "data:example.com/", + "origin": "null", + "protocol": "data:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + { + "input": "javascript:example.com/", + "base": "about:blank", + "href": "javascript:example.com/", + "origin": "null", + "protocol": "javascript:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + { + "input": "mailto:example.com/", + "base": "about:blank", + "href": "mailto:example.com/", + "origin": "null", + "protocol": "mailto:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + "# Based on 
http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/segments-userinfo-vs-host.html", + { + "input": "http:@www.example.com", + "base": "about:blank", + "href": "http://www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:/@www.example.com", + "base": "about:blank", + "href": "http://www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://@www.example.com", + "base": "about:blank", + "href": "http://www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:a:b@www.example.com", + "base": "about:blank", + "href": "http://a:b@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "a", + "password": "b", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:/a:b@www.example.com", + "base": "about:blank", + "href": "http://a:b@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "a", + "password": "b", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://a:b@www.example.com", + "base": "about:blank", + "href": "http://a:b@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "a", + "password": "b", + "host": "www.example.com", + 
"hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://@pple.com", + "base": "about:blank", + "href": "http://pple.com/", + "origin": "http://pple.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "pple.com", + "hostname": "pple.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http::b@www.example.com", + "base": "about:blank", + "href": "http://:b@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "b", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:/:b@www.example.com", + "base": "about:blank", + "href": "http://:b@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "b", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://:b@www.example.com", + "base": "about:blank", + "href": "http://:b@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "b", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:/:@/www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http://user@/www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http:@/www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http:/@/www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http://@/www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "https:@/www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http:a:b@/www.example.com", + "base": 
"about:blank", + "failure": true + }, + { + "input": "http:/a:b@/www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http://a:b@/www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http::@/www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http:a:@www.example.com", + "base": "about:blank", + "href": "http://a:@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "a", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:/a:@www.example.com", + "base": "about:blank", + "href": "http://a:@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "a", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://a:@www.example.com", + "base": "about:blank", + "href": "http://a:@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "a", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://www.@pple.com", + "base": "about:blank", + "href": "http://www.@pple.com/", + "origin": "http://pple.com", + "protocol": "http:", + "username": "www.", + "password": "", + "host": "pple.com", + "hostname": "pple.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:@:www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http:/@:www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http://@:www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http://:@www.example.com", + "base": "about:blank", + "href": 
"http://:@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + "# Others", + { + "input": "/", + "base": "http://www.example.com/test", + "href": "http://www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "/test.txt", + "base": "http://www.example.com/test", + "href": "http://www.example.com/test.txt", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/test.txt", + "search": "", + "hash": "" + }, + { + "input": ".", + "base": "http://www.example.com/test", + "href": "http://www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "..", + "base": "http://www.example.com/test", + "href": "http://www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "test.txt", + "base": "http://www.example.com/test", + "href": "http://www.example.com/test.txt", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/test.txt", + "search": "", + "hash": "" + }, + { + "input": "./test.txt", + "base": 
"http://www.example.com/test", + "href": "http://www.example.com/test.txt", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/test.txt", + "search": "", + "hash": "" + }, + { + "input": "../test.txt", + "base": "http://www.example.com/test", + "href": "http://www.example.com/test.txt", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/test.txt", + "search": "", + "hash": "" + }, + { + "input": "../aaa/test.txt", + "base": "http://www.example.com/test", + "href": "http://www.example.com/aaa/test.txt", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/aaa/test.txt", + "search": "", + "hash": "" + }, + { + "input": "../../test.txt", + "base": "http://www.example.com/test", + "href": "http://www.example.com/test.txt", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/test.txt", + "search": "", + "hash": "" + }, + { + "input": "中/test.txt", + "base": "http://www.example.com/test", + "href": "http://www.example.com/%E4%B8%AD/test.txt", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/%E4%B8%AD/test.txt", + "search": "", + "hash": "" + }, + { + "input": "http://www.example2.com", + "base": "http://www.example.com/test", + "href": "http://www.example2.com/", + "origin": "http://www.example2.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example2.com", + 
"hostname": "www.example2.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "//www.example2.com", + "base": "http://www.example.com/test", + "href": "http://www.example2.com/", + "origin": "http://www.example2.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example2.com", + "hostname": "www.example2.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "file:...", + "base": "http://www.example.com/test", + "href": "file:///...", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/...", + "search": "", + "hash": "" + }, + { + "input": "file:..", + "base": "http://www.example.com/test", + "href": "file:///", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "file:a", + "base": "http://www.example.com/test", + "href": "file:///a", + "origin": "file://", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/a", + "search": "", + "hash": "" + }, + "# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/host.html", + "Basic canonicalization, uppercase should be converted to lowercase", + { + "input": "http://ExAmPlE.CoM", + "base": "http://other.com/", + "href": "http://example.com/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://example example.com", + "base": "http://other.com/", + "failure": true + }, + { + "input": "http://Goo%20 goo%7C|.com", + "base": "http://other.com/", + "failure": true + }, + { + "input": "http://[]", + "base": "http://other.com/", + 
"failure": true + }, + { + "input": "http://[:]", + "base": "http://other.com/", + "failure": true + }, + "U+3000 is mapped to U+0020 (space) which is disallowed", + { + "input": "http://GOO\u00a0\u3000goo.com", + "base": "http://other.com/", + "failure": true + }, + "Other types of space (no-break, zero-width, zero-width-no-break) are name-prepped away to nothing. U+200B, U+2060, and U+FEFF, are ignored", + { + "input": "http://GOO\u200b\u2060\ufeffgoo.com", + "base": "http://other.com/", + "href": "http://googoo.com/", + "origin": "http://googoo.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "googoo.com", + "hostname": "googoo.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + "Ideographic full stop (full-width period for Chinese, etc.) should be treated as a dot. U+3002 is mapped to U+002E (dot)", + { + "input": "http://www.foo。bar.com", + "base": "http://other.com/", + "href": "http://www.foo.bar.com/", + "origin": "http://www.foo.bar.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.foo.bar.com", + "hostname": "www.foo.bar.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + "Invalid unicode characters should fail... U+FDD0 is disallowed; %ef%b7%90 is U+FDD0", + { + "input": "http://\ufdd0zyx.com", + "base": "http://other.com/", + "failure": true + }, + "This is the same as previous but escaped", + { + "input": "http://%ef%b7%90zyx.com", + "base": "http://other.com/", + "failure": true + }, + "Test name prepping, fullwidth input should be converted to ASCII and NOT IDN-ized. This is 'Go' in fullwidth UTF-8/UTF-16.", + { + "input": "http://Go.com", + "base": "http://other.com/", + "href": "http://go.com/", + "origin": "http://go.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "go.com", + "hostname": "go.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + "URL spec forbids the following. 
https://www.w3.org/Bugs/Public/show_bug.cgi?id=24257", + { + "input": "http://%41.com", + "base": "http://other.com/", + "failure": true + }, + { + "input": "http://%ef%bc%85%ef%bc%94%ef%bc%91.com", + "base": "http://other.com/", + "failure": true + }, + "...%00 in fullwidth should fail (also as escaped UTF-8 input)", + { + "input": "http://%00.com", + "base": "http://other.com/", + "failure": true + }, + { + "input": "http://%ef%bc%85%ef%bc%90%ef%bc%90.com", + "base": "http://other.com/", + "failure": true + }, + "Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN", + { + "input": "http://你好你好", + "base": "http://other.com/", + "href": "http://xn--6qqa088eba/", + "origin": "http://你好你好", + "protocol": "http:", + "username": "", + "password": "", + "host": "xn--6qqa088eba", + "hostname": "xn--6qqa088eba", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + "Invalid escaped characters should fail and the percents should be escaped. https://www.w3.org/Bugs/Public/show_bug.cgi?id=24191", + { + "input": "http://%zz%66%a.com", + "base": "http://other.com/", + "failure": true + }, + "If we get an invalid character that has been escaped.", + { + "input": "http://%25", + "base": "http://other.com/", + "failure": true + }, + { + "input": "http://hello%00", + "base": "http://other.com/", + "failure": true + }, + "Escaped numbers should be treated like IP addresses if they are.", + { + "input": "http://%30%78%63%30%2e%30%32%35%30.01", + "base": "http://other.com/", + "href": "http://192.168.0.1/", + "origin": "http://192.168.0.1", + "protocol": "http:", + "username": "", + "password": "", + "host": "192.168.0.1", + "hostname": "192.168.0.1", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://%30%78%63%30%2e%30%32%35%30.01%2e", + "base": "http://other.com/", + "href": "http://192.168.0.1/", + "origin": "http://192.168.0.1", + "protocol": "http:", + "username": "", + "password": "", + "host": 
"192.168.0.1", + "hostname": "192.168.0.1", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://192.168.0.257", + "base": "http://other.com/", + "failure": true + }, + "Invalid escaping should trigger the regular host error handling", + { + "input": "http://%3g%78%63%30%2e%30%32%35%30%2E.01", + "base": "http://other.com/", + "failure": true + }, + "Something that isn't exactly an IP should get treated as a host and spaces escaped", + { + "input": "http://192.168.0.1 hello", + "base": "http://other.com/", + "failure": true + }, + "Fullwidth and escaped UTF-8 fullwidth should still be treated as IP", + { + "input": "http://0Xc0.0250.01", + "base": "http://other.com/", + "href": "http://192.168.0.1/", + "origin": "http://192.168.0.1", + "protocol": "http:", + "username": "", + "password": "", + "host": "192.168.0.1", + "hostname": "192.168.0.1", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + "Broken IPv6", + { + "input": "http://[google.com]", + "base": "http://other.com/", + "failure": true + }, + "Misc Unicode", + { + "input": "http://foo:💩@example.com/bar", + "base": "http://other.com/", + "href": "http://foo:%F0%9F%92%A9@example.com/bar", + "origin": "http://example.com", + "protocol": "http:", + "username": "foo", + "password": "%F0%9F%92%A9", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/bar", + "search": "", + "hash": "" + }, + "# resolving a fragment against any scheme succeeds", + { + "input": "#", + "base": "test:test", + "href": "test:test#", + "origin": "null", + "protocol": "test:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "test", + "search": "", + "hash": "" + }, + { + "input": "#x", + "base": "mailto:x@x.com", + "href": "mailto:x@x.com#x", + "origin": "null", + "protocol": "mailto:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "x@x.com", + "search": 
"", + "hash": "#x" + }, + { + "input": "#x", + "base": "data:,", + "href": "data:,#x", + "origin": "null", + "protocol": "data:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": ",", + "search": "", + "hash": "#x" + }, + { + "input": "#x", + "base": "about:blank", + "href": "about:blank#x", + "origin": "null", + "protocol": "about:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "blank", + "search": "", + "hash": "#x" + }, + { + "input": "#", + "base": "test:test?test", + "href": "test:test?test#", + "origin": "null", + "protocol": "test:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "test", + "search": "?test", + "hash": "" + }, + "# multiple @ in authority state", + { + "input": "https://@test@test@example:800/", + "base": "http://doesnotmatter/", + "href": "https://%40test%40test@example:800/", + "origin": "https://example:800", + "protocol": "https:", + "username": "%40test%40test", + "password": "", + "host": "example:800", + "hostname": "example", + "port": "800", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "https://@@@example", + "base": "http://doesnotmatter/", + "href": "https://%40%40@example/", + "origin": "https://example", + "protocol": "https:", + "username": "%40%40", + "password": "", + "host": "example", + "hostname": "example", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + "non-az-09 characters", + { + "input": "http://`{}:`{}@h/`{}?`{}", + "base": "http://doesnotmatter/", + "href": "http://%60%7B%7D:%60%7B%7D@h/%60%7B%7D?`{}", + "origin": "http://h", + "protocol": "http:", + "username": "%60%7B%7D", + "password": "%60%7B%7D", + "host": "h", + "hostname": "h", + "port": "", + "pathname": "/%60%7B%7D", + "search": "?`{}", + "hash": "" + }, + "# Credentials in base", + { + "input": "/some/path", + "base": "http://user@example.org/smth", + "href": 
"http://user@example.org/some/path", + "origin": "http://example.org", + "protocol": "http:", + "username": "user", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/some/path", + "search": "", + "hash": "" + }, + { + "input": "", + "base": "http://user:pass@example.org:21/smth", + "href": "http://user:pass@example.org:21/smth", + "origin": "http://example.org:21", + "protocol": "http:", + "username": "user", + "password": "pass", + "host": "example.org:21", + "hostname": "example.org", + "port": "21", + "pathname": "/smth", + "search": "", + "hash": "" + }, + { + "input": "/some/path", + "base": "http://user:pass@example.org:21/smth", + "href": "http://user:pass@example.org:21/some/path", + "origin": "http://example.org:21", + "protocol": "http:", + "username": "user", + "password": "pass", + "host": "example.org:21", + "hostname": "example.org", + "port": "21", + "pathname": "/some/path", + "search": "", + "hash": "" + }, + "# a set of tests designed by zcorpan for relative URLs with unknown schemes", + { + "input": "i", + "base": "sc:sd", + "failure": true + }, + { + "input": "i", + "base": "sc:sd/sd", + "failure": true + }, + { + "input": "i", + "base": "sc:/pa/pa", + "href": "sc:/pa/i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/pa/i", + "search": "", + "hash": "" + }, + { + "input": "i", + "base": "sc://ho/pa", + "href": "sc://ho/i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "ho", + "hostname": "ho", + "port": "", + "pathname": "/i", + "search": "", + "hash": "" + }, + { + "input": "i", + "base": "sc:///pa/pa", + "href": "sc:///pa/i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/pa/i", + "search": "", + "hash": "" + }, + { + "input": "../i", + "base": "sc:sd", + "failure": true 
+ }, + { + "input": "../i", + "base": "sc:sd/sd", + "failure": true + }, + { + "input": "../i", + "base": "sc:/pa/pa", + "href": "sc:/i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/i", + "search": "", + "hash": "" + }, + { + "input": "../i", + "base": "sc://ho/pa", + "href": "sc://ho/i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "ho", + "hostname": "ho", + "port": "", + "pathname": "/i", + "search": "", + "hash": "" + }, + { + "input": "../i", + "base": "sc:///pa/pa", + "href": "sc:///i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/i", + "search": "", + "hash": "" + }, + { + "input": "/i", + "base": "sc:sd", + "failure": true + }, + { + "input": "/i", + "base": "sc:sd/sd", + "failure": true + }, + { + "input": "/i", + "base": "sc:/pa/pa", + "href": "sc:/i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/i", + "search": "", + "hash": "" + }, + { + "input": "/i", + "base": "sc://ho/pa", + "href": "sc://ho/i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "ho", + "hostname": "ho", + "port": "", + "pathname": "/i", + "search": "", + "hash": "" + }, + { + "input": "/i", + "base": "sc:///pa/pa", + "href": "sc:///i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/i", + "search": "", + "hash": "" + }, + { + "input": "?i", + "base": "sc:sd", + "failure": true + }, + { + "input": "?i", + "base": "sc:sd/sd", + "failure": true + }, + { + "input": "?i", + "base": "sc:/pa/pa", + "href": "sc:/pa/pa?i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + 
"pathname": "/pa/pa", + "search": "?i", + "hash": "" + }, + { + "input": "?i", + "base": "sc://ho/pa", + "href": "sc://ho/pa?i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "ho", + "hostname": "ho", + "port": "", + "pathname": "/pa", + "search": "?i", + "hash": "" + }, + { + "input": "?i", + "base": "sc:///pa/pa", + "href": "sc:///pa/pa?i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/pa/pa", + "search": "?i", + "hash": "" + }, + { + "input": "#i", + "base": "sc:sd", + "href": "sc:sd#i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "sd", + "search": "", + "hash": "#i" + }, + { + "input": "#i", + "base": "sc:sd/sd", + "href": "sc:sd/sd#i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "sd/sd", + "search": "", + "hash": "#i" + }, + { + "input": "#i", + "base": "sc:/pa/pa", + "href": "sc:/pa/pa#i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/pa/pa", + "search": "", + "hash": "#i" + }, + { + "input": "#i", + "base": "sc://ho/pa", + "href": "sc://ho/pa#i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "ho", + "hostname": "ho", + "port": "", + "pathname": "/pa", + "search": "", + "hash": "#i" + }, + { + "input": "#i", + "base": "sc:///pa/pa", + "href": "sc:///pa/pa#i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/pa/pa", + "search": "", + "hash": "#i" + }, + "# make sure that relative URL logic works on known typically non-relative schemes too", + { + "input": "about:/../", + "base": "about:blank", + "href": "about:/", + "origin": 
"null", + "protocol": "about:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "data:/../", + "base": "about:blank", + "href": "data:/", + "origin": "null", + "protocol": "data:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "javascript:/../", + "base": "about:blank", + "href": "javascript:/", + "origin": "null", + "protocol": "javascript:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "mailto:/../", + "base": "about:blank", + "href": "mailto:/", + "origin": "null", + "protocol": "mailto:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + "# unknown schemes and non-ASCII domains", + { + "input": "sc://ñ.test/", + "base": "about:blank", + "href": "sc://xn--ida.test/", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "xn--ida.test", + "hostname": "xn--ida.test", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + "# unknown schemes and backslashes", + { + "input": "sc:\\../", + "base": "about:blank", + "href": "sc:\\../", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "\\../", + "search": "", + "hash": "" + }, + "# tests from jsdom/whatwg-url designed for code coverage", + { + "input": "http://127.0.0.1:10100/relative_import.html", + "base": "about:blank", + "href": "http://127.0.0.1:10100/relative_import.html", + "origin": "http://127.0.0.1:10100", + "protocol": "http:", + "username": "", + "password": "", + "host": "127.0.0.1:10100", + "hostname": "127.0.0.1", + "port": "10100", + "pathname": "/relative_import.html", + "search": 
"", + "hash": "" + }, + { + "input": "http://facebook.com/?foo=%7B%22abc%22", + "base": "about:blank", + "href": "http://facebook.com/?foo=%7B%22abc%22", + "origin": "http://facebook.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "facebook.com", + "hostname": "facebook.com", + "port": "", + "pathname": "/", + "search": "?foo=%7B%22abc%22", + "hash": "" + }, + { + "input": "https://localhost:3000/jqueryui@1.2.3", + "base": "about:blank", + "href": "https://localhost:3000/jqueryui@1.2.3", + "origin": "https://localhost:3000", + "protocol": "https:", + "username": "", + "password": "", + "host": "localhost:3000", + "hostname": "localhost", + "port": "3000", + "pathname": "/jqueryui@1.2.3", + "search": "", + "hash": "" + } +] diff --git a/tests/urltestdata.txt b/tests/urltestdata.txt deleted file mode 100644 index 29bf4b0c..00000000 --- a/tests/urltestdata.txt +++ /dev/null @@ -1,329 +0,0 @@ -# This file is from https://github.com/w3c/web-platform-tests/blob/master/url/urltestdata.txt -# and used under a 3-clause BSD license. 
- -# FORMAT NOT DOCUMENTED YET (parser is urltestparser.js) -# https://github.com/w3c/web-platform-tests/blob/master/url/urltestparser.js - -# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/script-tests/segments.js -http://example\t.\norg http://example.org/foo/bar s:http h:example.org p:/ -http://user:pass@foo:21/bar;par?b#c s:http u:user pass:pass h:foo port:21 p:/bar;par q:?b f:#c -http:foo.com s:http h:example.org p:/foo/foo.com -\t\s\s\s:foo.com\s\s\s\n s:http h:example.org p:/foo/:foo.com -\sfoo.com\s\s s:http h:example.org p:/foo/foo.com -a:\t\sfoo.com s:a p:\sfoo.com -http://f:21/\sb\s?\sd\s#\se\s s:http h:f port:21 p:/%20b%20 q:?%20d%20 f:#\se -http://f:/c s:http h:f p:/c -http://f:0/c s:http h:f port:0 p:/c -http://f:00000000000000/c s:http h:f port:0 p:/c -http://f:00000000000000000000080/c s:http h:f p:/c -http://f:b/c -http://f:\s/c -http://f:\n/c s:http h:f p:/c -http://f:fifty-two/c -http://f:9999/c s:http h:f port:9999 p:/c -http://f:\s21\s/\sb\s?\sd\s#\se\s - s:http h:example.org p:/foo/bar -\s\s\t s:http h:example.org p:/foo/bar -:foo.com/ s:http h:example.org p:/foo/:foo.com/ -:foo.com\\ s:http h:example.org p:/foo/:foo.com/ -: s:http h:example.org p:/foo/: -:a s:http h:example.org p:/foo/:a -:/ s:http h:example.org p:/foo/:/ -:\\ s:http h:example.org p:/foo/:/ -:# s:http h:example.org p:/foo/: f:# -# s:http h:example.org p:/foo/bar f:# -#/ s:http h:example.org p:/foo/bar f:#/ -#\\ s:http h:example.org p:/foo/bar f:#\\ -#;? s:http h:example.org p:/foo/bar f:#;? -? s:http h:example.org p:/foo/bar q:? 
-/ s:http h:example.org p:/ -:23 s:http h:example.org p:/foo/:23 -/:23 s:http h:example.org p:/:23 -:: s:http h:example.org p:/foo/:: -::23 s:http h:example.org p:/foo/::23 -foo:// s:foo p:/ -http://a:b@c:29/d s:http u:a pass:b h:c port:29 p:/d -http::@c:29 s:http h:example.org p:/foo/:@c:29 -http://&a:foo(b]c@d:2/ s:http u:&a pass:foo(b%5Dc h:d port:2 p:/ -http://::@c@d:2 s:http pass:%3A%40c h:d port:2 p:/ -http://foo.com:b@d/ s:http u:foo.com pass:b h:d p:/ -http://foo.com/\\@ s:http h:foo.com p://@ -http:\\\\foo.com\\ s:http h:foo.com p:/ -http:\\\\a\\b:c\\d@foo.com\\ s:http h:a p:/b:c/d@foo.com/ -foo:/ s:foo p:/ -foo:/bar.com/ s:foo p:/bar.com/ -foo:///////// s:foo p://///// -foo://///////bar.com/ s:foo p:///////bar.com/ -foo:////:///// s:foo p://:///// -c:/foo s:c p:/foo -//foo/bar s:http h:foo p:/bar -http://foo/path;a??e#f#g s:http h:foo p:/path;a q:??e f:#f#g -http://foo/abcd?efgh?ijkl s:http h:foo p:/abcd q:?efgh?ijkl -http://foo/abcd#foo?bar s:http h:foo p:/abcd f:#foo?bar -[61:24:74]:98 s:http h:example.org p:/foo/[61:24:74]:98 -http:[61:27]/:foo s:http h:example.org p:/foo/[61:27]/:foo -http://[1::2]:3:4 -http://2001::1 -http://2001::1] -http://2001::1]:80 -http://[2001::1] s:http h:[2001::1] p:/ -http://[2001::1]:80 s:http h:[2001::1] p:/ -http:/example.com/ s:http h:example.org p:/example.com/ -ftp:/example.com/ s:ftp h:example.com p:/ -https:/example.com/ s:https h:example.com p:/ -madeupscheme:/example.com/ s:madeupscheme p:/example.com/ -file:/example.com/ s:file p:/example.com/ -ftps:/example.com/ s:ftps p:/example.com/ -gopher:/example.com/ s:gopher h:example.com p:/ -ws:/example.com/ s:ws h:example.com p:/ -wss:/example.com/ s:wss h:example.com p:/ -data:/example.com/ s:data p:/example.com/ -javascript:/example.com/ s:javascript p:/example.com/ -mailto:/example.com/ s:mailto p:/example.com/ -http:example.com/ s:http h:example.org p:/foo/example.com/ -ftp:example.com/ s:ftp h:example.com p:/ -https:example.com/ s:https h:example.com p:/ 
-madeupscheme:example.com/ s:madeupscheme p:example.com/ -ftps:example.com/ s:ftps p:example.com/ -gopher:example.com/ s:gopher h:example.com p:/ -ws:example.com/ s:ws h:example.com p:/ -wss:example.com/ s:wss h:example.com p:/ -data:example.com/ s:data p:example.com/ -javascript:example.com/ s:javascript p:example.com/ -mailto:example.com/ s:mailto p:example.com/ -/a/b/c s:http h:example.org p:/a/b/c -/a/\s/c s:http h:example.org p:/a/%20/c -/a%2fc s:http h:example.org p:/a%2fc -/a/%2f/c s:http h:example.org p:/a/%2f/c -#\u03B2 s:http h:example.org p:/foo/bar f:#\u03B2 -data:text/html,test#test s:data p:text/html,test f:#test - -# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/file.html -file:c:\\foo\\bar.html file:///tmp/mock/path s:file p:/c:/foo/bar.html -\s\sFile:c|////foo\\bar.html s:file p:/c:////foo/bar.html -C|/foo/bar s:file p:/C:/foo/bar -/C|\\foo\\bar s:file p:/C:/foo/bar -//C|/foo/bar s:file p:/C:/foo/bar -//server/file s:file h:server p:/file -\\\\server\\file s:file h:server p:/file -/\\server/file s:file h:server p:/file -file:///foo/bar.txt s:file p:/foo/bar.txt -file:///home/me s:file p:/home/me -// s:file p:/ -/// s:file p:/ -///test s:file p:/test -file://test s:file h:test p:/ -file://localhost s:file p:/ -file://localhost/ s:file p:/ -file://localhost/test s:file p:/test -test s:file p:/tmp/mock/test -file:test s:file p:/tmp/mock/test - -# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/script-tests/path.js -http://example.com/././foo about:blank s:http h:example.com p:/foo -http://example.com/./.foo s:http h:example.com p:/.foo -http://example.com/foo/. s:http h:example.com p:/foo/ -http://example.com/foo/./ s:http h:example.com p:/foo/ -http://example.com/foo/bar/.. 
s:http h:example.com p:/foo/ -http://example.com/foo/bar/../ s:http h:example.com p:/foo/ -http://example.com/foo/..bar s:http h:example.com p:/foo/..bar -http://example.com/foo/bar/../ton s:http h:example.com p:/foo/ton -http://example.com/foo/bar/../ton/../../a s:http h:example.com p:/a -http://example.com/foo/../../.. s:http h:example.com p:/ -http://example.com/foo/../../../ton s:http h:example.com p:/ton -http://example.com/foo/%2e s:http h:example.com p:/foo/ -http://example.com/foo/%2e%2 s:http h:example.com p:/foo/%2e%2 -http://example.com/foo/%2e./%2e%2e/.%2e/%2e.bar s:http h:example.com p:/%2e.bar -http://example.com////../.. s:http h:example.com p:// -http://example.com/foo/bar//../.. s:http h:example.com p:/foo/ -http://example.com/foo/bar//.. s:http h:example.com p:/foo/bar/ -http://example.com/foo s:http h:example.com p:/foo -http://example.com/%20foo s:http h:example.com p:/%20foo -http://example.com/foo% s:http h:example.com p:/foo% -http://example.com/foo%2 s:http h:example.com p:/foo%2 -http://example.com/foo%2zbar s:http h:example.com p:/foo%2zbar -http://example.com/foo%2\u00C2\u00A9zbar s:http h:example.com p:/foo%2%C3%82%C2%A9zbar -http://example.com/foo%41%7a s:http h:example.com p:/foo%41%7a -http://example.com/foo\t\u0091%91 s:http h:example.com p:/foo%C2%91%91 -http://example.com/foo%00%51 s:http h:example.com p:/foo%00%51 -http://example.com/(%28:%3A%29) s:http h:example.com p:/(%28:%3A%29) -http://example.com/%3A%3a%3C%3c s:http h:example.com p:/%3A%3a%3C%3c -http://example.com/foo\tbar s:http h:example.com p:/foobar -http://example.com\\\\foo\\\\bar s:http h:example.com p://foo//bar -http://example.com/%7Ffp3%3Eju%3Dduvgw%3Dd s:http h:example.com p:/%7Ffp3%3Eju%3Dduvgw%3Dd -http://example.com/@asdf%40 s:http h:example.com p:/@asdf%40 -http://example.com/\u4F60\u597D\u4F60\u597D s:http h:example.com p:/%E4%BD%A0%E5%A5%BD%E4%BD%A0%E5%A5%BD -http://example.com/\u2025/foo s:http h:example.com p:/%E2%80%A5/foo -http://example.com/\uFEFF/foo 
s:http h:example.com p:/%EF%BB%BF/foo -http://example.com/\u202E/foo/\u202D/bar s:http h:example.com p:/%E2%80%AE/foo/%E2%80%AD/bar - -# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/script-tests/relative.js -http://www.google.com/foo?bar=baz# about:blank s:http h:www.google.com p:/foo q:?bar=baz f:# -http://www.google.com/foo?bar=baz#\s\u00BB s:http h:www.google.com p:/foo q:?bar=baz f:#\s%C2%BB -http://[www.google.com]/ -http://www.google.com s:http h:www.google.com p:/ -http://192.0x00A80001 s:http h:192.168.0.1 p:/ -http://www/foo%2Ehtml s:http h:www p:/foo%2Ehtml -http://www/foo/%2E/html s:http h:www p:/foo/html -http://user:pass@/ -http://%25DOMAIN:foobar@foodomain.com/ s:http u:%25DOMAIN pass:foobar h:foodomain.com p:/ -http:\\\\www.google.com\\foo s:http h:www.google.com p:/foo -http://foo:80/ s:http h:foo p:/ -http://foo:81/ s:http h:foo port:81 p:/ -httpa://foo:80/ s:httpa h:foo port:80 p:/ -http://foo:-80/ -https://foo:443/ s:https h:foo p:/ -https://foo:80/ s:https h:foo port:80 p:/ -ftp://foo:21/ s:ftp h:foo p:/ -ftp://foo:80/ s:ftp h:foo port:80 p:/ -gopher://foo:70/ s:gopher h:foo p:/ -gopher://foo:443/ s:gopher h:foo port:443 p:/ -ws://foo:80/ s:ws h:foo p:/ -ws://foo:81/ s:ws h:foo port:81 p:/ -ws://foo:443/ s:ws h:foo port:443 p:/ -ws://foo:815/ s:ws h:foo port:815 p:/ -wss://foo:80/ s:wss h:foo port:80 p:/ -wss://foo:81/ s:wss h:foo port:81 p:/ -wss://foo:443/ s:wss h:foo p:/ -wss://foo:815/ s:wss h:foo port:815 p:/ -http:/example.com/ s:http h:example.com p:/ -ftp:/example.com/ s:ftp h:example.com p:/ -https:/example.com/ s:https h:example.com p:/ -madeupscheme:/example.com/ s:madeupscheme p:/example.com/ -file:/example.com/ s:file p:/example.com/ -ftps:/example.com/ s:ftps p:/example.com/ -gopher:/example.com/ s:gopher h:example.com p:/ -ws:/example.com/ s:ws h:example.com p:/ -wss:/example.com/ s:wss h:example.com p:/ -data:/example.com/ s:data p:/example.com/ -javascript:/example.com/ s:javascript p:/example.com/ 
-mailto:/example.com/ s:mailto p:/example.com/ -http:example.com/ s:http h:example.com p:/ -ftp:example.com/ s:ftp h:example.com p:/ -https:example.com/ s:https h:example.com p:/ -madeupscheme:example.com/ s:madeupscheme p:example.com/ -ftps:example.com/ s:ftps p:example.com/ -gopher:example.com/ s:gopher h:example.com p:/ -ws:example.com/ s:ws h:example.com p:/ -wss:example.com/ s:wss h:example.com p:/ -data:example.com/ s:data p:example.com/ -javascript:example.com/ s:javascript p:example.com/ -mailto:example.com/ s:mailto p:example.com/ - -# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/segments-userinfo-vs-host.html -http:@www.example.com about:blank s:http h:www.example.com p:/ -http:/@www.example.com s:http h:www.example.com p:/ -http://@www.example.com s:http h:www.example.com p:/ -http:a:b@www.example.com s:http u:a pass:b h:www.example.com p:/ -http:/a:b@www.example.com s:http u:a pass:b h:www.example.com p:/ -http://a:b@www.example.com s:http u:a pass:b h:www.example.com p:/ -http://@pple.com s:http h:pple.com p:/ -http::b@www.example.com s:http pass:b h:www.example.com p:/ -http:/:b@www.example.com s:http pass:b h:www.example.com p:/ -http://:b@www.example.com s:http pass:b h:www.example.com p:/ -http:/:@/www.example.com -http://user@/www.example.com -http:@/www.example.com -http:/@/www.example.com -http://@/www.example.com -https:@/www.example.com -http:a:b@/www.example.com -http:/a:b@/www.example.com -http://a:b@/www.example.com -http::@/www.example.com -http:a:@www.example.com s:http u:a pass: h:www.example.com p:/ -http:/a:@www.example.com s:http u:a pass: h:www.example.com p:/ -http://a:@www.example.com s:http u:a pass: h:www.example.com p:/ -http://www.@pple.com s:http u:www. 
h:pple.com p:/ -http:@:www.example.com -http:/@:www.example.com -http://@:www.example.com -http://:@www.example.com s:http pass: h:www.example.com p:/ - -#Others -/ http://www.example.com/test s:http h:www.example.com p:/ -/test.txt s:http h:www.example.com p:/test.txt -. s:http h:www.example.com p:/ -.. s:http h:www.example.com p:/ -test.txt s:http h:www.example.com p:/test.txt -./test.txt s:http h:www.example.com p:/test.txt -../test.txt s:http h:www.example.com p:/test.txt -../aaa/test.txt s:http h:www.example.com p:/aaa/test.txt -../../test.txt s:http h:www.example.com p:/test.txt -\u4E2D/test.txt s:http h:www.example.com p:/%E4%B8%AD/test.txt -http://www.example2.com s:http h:www.example2.com p:/ - -# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/host.html - -# Basic canonicalization, uppercase should be converted to lowercase -http://ExAmPlE.CoM http://other.com/ s:http p:/ h:example.com - -# Spaces should fail -http://example\sexample.com - -# This should fail -http://Goo%20\sgoo%7C|.com - -# This should fail -http://GOO\u00a0\u3000goo.com - -# This should fail -http://[] -http://[:] - -# Other types of space (no-break, zero-width, zero-width-no-break) are -# name-prepped away to nothing. -http://GOO\u200b\u2060\ufeffgoo.com s:http p:/ h:googoo.com - -# Ideographic full stop (full-width period for Chinese, etc.) should be -# treated as a dot. -http://www.foo\u3002bar.com s:http p:/ h:www.foo.bar.com - -# Invalid unicode characters should fail... -http://\ufdd0zyx.com - -# ...This is the same as previous but with with escaped. -http://%ef%b7%90zyx.com - -# Test name prepping, fullwidth input should be converted to ASCII and NOT -# IDN-ized. This is "Go" in fullwidth UTF-8/UTF-16. -http://\uff27\uff4f.com s:http p:/ h:go.com - -# URL spec forbids the following. 
-# https://www.w3.org/Bugs/Public/show_bug.cgi?id=24257 -http://\uff05\uff14\uff11.com -http://%ef%bc%85%ef%bc%94%ef%bc%91.com - -# ...%00 in fullwidth should fail (also as escaped UTF-8 input) -http://\uff05\uff10\uff10.com -http://%ef%bc%85%ef%bc%90%ef%bc%90.com - -# Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN -http://\u4f60\u597d\u4f60\u597d s:http p:/ h:xn--6qqa088eba - -# Invalid escaped characters should fail and the percents should be -# escaped. https://www.w3.org/Bugs/Public/show_bug.cgi?id=24191 -http://%zz%66%a.com - -# If we get an invalid character that has been escaped. -http://%25 -http://hello%00 - -# Escaped numbers should be treated like IP addresses if they are. -http://%30%78%63%30%2e%30%32%35%30.01 s:http p:/ h:192.168.0.1 -http://%30%78%63%30%2e%30%32%35%30.01%2e s:http p:/ h:192.168.0.1 - -# Invalid escaping should trigger the regular host error handling. -http://%3g%78%63%30%2e%30%32%35%30%2E.01 - -# Something that isn't exactly an IP should get treated as a host and -# spaces escaped. -http://192.168.0.1\shello - -# Fullwidth and escaped UTF-8 fullwidth should still be treated as IP. -# These are "0Xc0.0250.01" in fullwidth. -http://\uff10\uff38\uff43\uff10\uff0e\uff10\uff12\uff15\uff10\uff0e\uff10\uff11 s:http p:/ h:192.168.0.1 - -# Broken IP addresses. -http://192.168.0.257 -http://[google.com] diff --git a/tests/wpt.rs b/tests/wpt.rs index 6f9f4021..18e43aca 100644 --- a/tests/wpt.rs +++ b/tests/wpt.rs @@ -8,182 +8,98 @@ //! 
Tests copied form https://github.com/w3c/web-platform-tests/blob/master/url/ +extern crate rustc_serialize; extern crate test; extern crate url; -use std::char; -use url::Url; +use rustc_serialize::json::Json; +use url::{Url, WebIdl}; -fn run_one(entry: Entry) { - let Entry { - input, - base, - scheme: expected_scheme, - username: expected_username, - password: expected_password, - host: expected_host, - port: expected_port, - path: expected_path, - query: expected_query, - fragment: expected_fragment, - expected_failure, - } = entry; +fn run_one(input: String, base: String, expected: Result) { let base = match Url::parse(&base) { Ok(base) => base, - Err(message) => panic!("Error parsing base {}: {}", base, message) + Err(message) => panic!("Error parsing base {:?}: {}", base, message) }; - let expecting_err = expected_scheme.is_none() ^ expected_failure; - let url = match base.join(&input) { - Ok(url) => url, - Err(reason) => { - assert!(expecting_err, "Error parsing URL {}: {}", input, reason); - return - } + let (url, expected) = match (base.join(&input), expected) { + (Ok(url), Ok(expected)) => (url, expected), + (Err(_), Err(())) => return, + (Err(message), Ok(_)) => panic!("Error parsing URL {:?}: {}", input, message), + (Ok(_), Err(())) => panic!("Expected a parse error for URL {:?}", input), }; - assert!(!expecting_err, "Expected a parse error for URL {}", input); - macro_rules! assert_eq { - ($a: expr, $b: expr) => { + macro_rules! 
assert_getter { + ($attribute: ident) => { { - let a = $a; - let b = $b; - if a != b { - if expected_failure { - return - } else { - panic!("{:?} != {:?} for {:?}", a, b, url) - } - } + let a = WebIdl::$attribute(&url); + let b = expected.$attribute; + assert!(a == b, "{:?} != {:?} for URL {:?}", a, b, url); } } } - assert_eq!(Some(url.scheme().to_owned()), expected_scheme); - assert_eq!(url.username(), expected_username); - assert_eq!(url.password().map(|s| s.to_owned()), expected_password); - assert_eq!(url.host_str().unwrap_or("").to_owned(), expected_host); - assert_eq!(url.port(), expected_port); - assert_eq!(Some(url.path().to_owned()), expected_path); - assert_eq!(url.query().map(|s| format!("?{}", s)), expected_query); - assert_eq!(url.fragment().map(|s| format!("#{}", s)), expected_fragment); - - assert!(!expected_failure, "Unexpected success for {}", input); + assert_getter!(href); + //assert_getter!(origin); FIXME + assert_getter!(protocol); + assert_getter!(username); + assert_getter!(password); + assert_getter!(host); + assert_getter!(hostname); + assert_getter!(port); + assert_getter!(pathname); + assert_getter!(search); + assert_getter!(hash); } -struct Entry { - input: String, - base: String, - scheme: Option, +struct TestCase { + href: String, + origin: String, + protocol: String, username: String, - password: Option, + password: String, host: String, - port: Option, - path: Option, - query: Option, - fragment: Option, - expected_failure: bool, + hostname: String, + port: String, + pathname: String, + search: String, + hash: String, } -fn parse_test_data(input: &str) -> Vec { - let mut tests: Vec = Vec::new(); - for line in input.lines() { - if line == "" || line.starts_with("#") { - continue - } - let mut pieces = line.split(' ').collect::>(); - let expected_failure = pieces[0] == "XFAIL"; - if expected_failure { - pieces.remove(0); +fn main() { + let json = Json::from_str(include_str!("urltestdata.json")) + .expect("JSON parse error in 
urltestdata.json"); + let tests = json.as_array().unwrap().iter().filter_map(|entry| { + if entry.is_string() { + return None // ignore comments } - let input = unescape(pieces.remove(0)); - let mut test = Entry { - input: input, - base: if pieces.is_empty() || pieces[0] == "" { - tests.last().unwrap().base.clone() - } else { - unescape(pieces.remove(0)) - }, - scheme: None, - username: String::new(), - password: None, - host: String::new(), - port: None, - path: None, - query: None, - fragment: None, - expected_failure: expected_failure, + let string = |key| entry.find(key).unwrap().as_string().unwrap().to_owned(); + let base = string("base"); + let input = string("input"); + let expected = if entry.find("failure").is_some() { + Err(()) + } else { + Ok(TestCase { + href: string("href"), + origin: string("origin"), + protocol: string("protocol"), + username: string("username"), + password: string("password"), + host: string("host"), + hostname: string("hostname"), + port: string("port"), + pathname: string("pathname"), + search: string("search"), + hash: string("hash"), + }) }; - for piece in pieces { - if piece == "" || piece.starts_with("#") { - continue - } - let colon = piece.find(':').unwrap(); - let value = unescape(&piece[colon + 1..]); - match &piece[..colon] { - "s" => test.scheme = Some(value), - "u" => test.username = value, - "pass" => test.password = Some(value), - "h" => test.host = value, - "port" => test.port = Some(value.parse().unwrap()), - "p" => test.path = Some(value), - "q" => test.query = Some(value), - "f" => test.fragment = Some(value), - _ => panic!("Invalid token") - } - } - tests.push(test) - } - tests -} - -fn unescape(input: &str) -> String { - let mut output = String::new(); - let mut chars = input.chars(); - loop { - match chars.next() { - None => return output, - Some(c) => output.push( - if c == '\\' { - match chars.next().unwrap() { - '\\' => '\\', - 'n' => '\n', - 'r' => '\r', - 's' => ' ', - 't' => '\t', - 'f' => '\x0C', - 'u' 
=> { - char::from_u32(((( - chars.next().unwrap().to_digit(16).unwrap()) * 16 + - chars.next().unwrap().to_digit(16).unwrap()) * 16 + - chars.next().unwrap().to_digit(16).unwrap()) * 16 + - chars.next().unwrap().to_digit(16).unwrap()).unwrap() - } - _ => panic!("Invalid test data input"), - } - } else { - c - } - ) - } - } -} - -fn make_test(entry: Entry) -> test::TestDescAndFn { - test::TestDescAndFn { - desc: test::TestDesc { - name: test::DynTestName(format!("{:?} base {:?}", entry.input, entry.base)), - ignore: false, - should_panic: test::ShouldPanic::No, - }, - testfn: test::TestFn::dyn_test_fn(move || run_one(entry)), - } - -} - -fn main() { - test::test_main( - &std::env::args().collect::>(), - parse_test_data(include_str!("urltestdata.txt")).into_iter().map(make_test).collect(), - ) + Some(test::TestDescAndFn { + desc: test::TestDesc { + name: test::DynTestName(format!("{:?} @ base {:?}", input, base)), + ignore: false, + should_panic: test::ShouldPanic::No, + }, + testfn: test::TestFn::dyn_test_fn(move || run_one(input, base, expected)), + }) + }).collect(); + test::test_main(&std::env::args().collect::>(), tests) } From f59870f5d645518058f9fce6f3f721273625404d Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Thu, 11 Feb 2016 18:04:06 +0100 Subject: [PATCH 19/89] Remove Url::has_host Use .host().is_some() instead. --- src/lib.rs | 16 ++++------------ src/slicing.rs | 6 +++--- 2 files changed, 7 insertions(+), 15 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index bbbd2cb1..fd0e8203 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -232,7 +232,7 @@ impl Url { /// Return the password for this URL, if any, as a percent-encoded ASCII string. 
pub fn password(&self) -> Option<&str> { if self.byte_at(self.username_end) == b':' { - debug_assert!(self.has_host()); + debug_assert!(self.host().is_some()); debug_assert!(self.byte_at(self.host_start - 1) == b'@'); Some(self.slice(self.username_end + 1..self.host_start - 1)) } else { @@ -240,14 +240,6 @@ impl Url { } } - /// Return whether this URL has a host. - /// - /// Non-relative URLs (typical of `data:` and `mailto:`) and some `file:` URLs don’ - #[inline] - pub fn has_host(&self) -> bool { - !matches!(self.host, HostInternal::None) - } - /// Return the string representation of the host (domain or IP address) for this URL, if any. /// Non-ASCII domains are punycode-encoded per IDNA. /// @@ -256,10 +248,10 @@ impl Url { /// /// See also the `host` method. pub fn host_str(&self) -> Option<&str> { - if self.has_host() { - Some(self.slice(self.host_start..self.host_end)) - } else { + if matches!(self.host, HostInternal::None) { None + } else { + Some(self.slice(self.host_start..self.host_end)) } } diff --git a/src/slicing.rs b/src/slicing.rs index 63697942..4dc8e852 100644 --- a/src/slicing.rs +++ b/src/slicing.rs @@ -58,7 +58,7 @@ impl Index> for Url { /// ```notrust /// url = /// scheme ":" -/// [ "//" [ username [ ":" password ]? "@" ]? host [ ":" port ]? ] +/// [ "//" [ username [ ":" password ]? "@" ]? host [ ":" port ]? ]? /// path [ "?" query ]? [ "#" fragment ]? 
/// ``` /// @@ -116,7 +116,7 @@ impl Url { Position::AfterUsername => self.username_end as usize, Position::BeforePassword => if self.port.is_some() { - debug_assert!(self.has_host()); + debug_assert!(self.host().is_some()); debug_assert!(self.byte_at(self.username_end) == b':'); self.username_end as usize + ":".len() } else { @@ -125,7 +125,7 @@ impl Url { }, Position::AfterPassword => if self.port.is_some() { - debug_assert!(self.has_host()); + debug_assert!(self.host().is_some()); debug_assert!(self.byte_at(self.username_end) == b':'); debug_assert!(self.byte_at(self.host_start - "@".len() as u32) == b'@'); self.host_start as usize - "@".len() From 42b57d398ab856a4a6a871a92e4239ed670a89ab Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 12 Feb 2016 17:24:56 +0100 Subject: [PATCH 20/89] Remove unused ParseError variants --- src/host.rs | 6 +++--- src/parser.rs | 26 ++++---------------------- 2 files changed, 7 insertions(+), 25 deletions(-) diff --git a/src/host.rs b/src/host.rs index fce48bd3..2124b1ab 100644 --- a/src/host.rs +++ b/src/host.rs @@ -161,7 +161,7 @@ fn longest_zero_sequence(pieces: &[u16; 8]) -> (isize, isize) { } -fn parse_ipv4number(mut input: &str) -> ParseResult { +fn parse_ipv4number(mut input: &str) -> Result { let mut r = 10; if input.starts_with("0x") || input.starts_with("0X") { input = &input[2..]; @@ -174,11 +174,11 @@ fn parse_ipv4number(mut input: &str) -> ParseResult { return Ok(0); } if input.starts_with("+") { - return Err(ParseError::InvalidIpv4Address) + return Err(()) } match u32::from_str_radix(&input, r) { Ok(number) => Ok(number), - Err(_) => Err(ParseError::InvalidIpv4Address), + Err(_) => Err(()), } } diff --git a/src/parser.rs b/src/parser.rs index c7fef178..3db1fd78 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -44,30 +44,12 @@ macro_rules! simple_enum_error { simple_enum_error! 
{ EmptyHost => "empty host", IdnaError => "invalid international domain name", - InvalidScheme => "invalid scheme", InvalidPort => "invalid port number", InvalidIpv4Address => "invalid IPv4 address", InvalidIpv6Address => "invalid IPv6 address", InvalidDomainCharacter => "invalid domain character", - InvalidCharacter => "invalid character", - InvalidBackslash => "invalid backslash", - InvalidPercentEncoded => "invalid percent-encoded sequence", - InvalidAtSymbolInUser => "invalid @-symbol in user", - ExpectedTwoSlashes => "expected two slashes (//)", - ExpectedInitialSlash => "expected the input to start with a slash", - NonUrlCodePoint => "non URL code point", - RelativeUrlWithScheme => "relative URL with scheme", RelativeUrlWithoutBase => "relative URL without a base", RelativeUrlWithNonRelativeBase => "relative URL with a non-relative base", - NonAsciiDomainsNotSupportedYet => "non-ASCII domains are not supported yet", - CannotSetJavascriptFragment => "cannot set fragment on javascript: URL", - CannotSetPortWithFileLikeScheme => "cannot set port with file-like scheme", - CannotSetUsernameWithNonRelativeScheme => "cannot set username with non-relative scheme", - CannotSetPasswordWithNonRelativeScheme => "cannot set password with non-relative scheme", - CannotSetHostPortWithNonRelativeScheme => "cannot set host and port with non-relative scheme", - CannotSetHostWithNonRelativeScheme => "cannot set host with non-relative scheme", - CannotSetPortWithNonRelativeScheme => "cannot set port with non-relative scheme", - CannotSetPathWithNonRelativeScheme => "cannot set path with non-relative scheme", Overflow => "URLs more than 4 GB are not supported", } @@ -174,9 +156,9 @@ impl<'a> Parser<'a> { } } - pub fn parse_scheme<'i>(&mut self, input: &'i str, context: Context) -> ParseResult<&'i str> { + pub fn parse_scheme<'i>(&mut self, input: &'i str, context: Context) -> Result<&'i str, ()> { if input.is_empty() || !input.starts_with(ascii_alpha) { - return 
Err(ParseError::InvalidScheme) + return Err(()) } debug_assert!(self.serialization.is_empty()); for (i, c) in input.char_indices() { @@ -187,7 +169,7 @@ impl<'a> Parser<'a> { ':' => return Ok(&input[i + 1..]), _ => { self.serialization.clear(); - return Err(ParseError::InvalidScheme) + return Err(()) } } } @@ -196,7 +178,7 @@ impl<'a> Parser<'a> { Context::Setter => Ok(""), Context::UrlParser => { self.serialization.clear(); - Err(ParseError::InvalidScheme) + Err(()) } } } From 194da72e7df3d1ed81ffbf8254744cc70229bd55 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Thu, 11 Feb 2016 18:12:30 +0100 Subject: [PATCH 21/89] Make context a field of Parser. --- src/lib.rs | 1 + src/parser.rs | 54 +++++++++++++++++++++++++-------------------------- 2 files changed, 27 insertions(+), 28 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index fd0e8203..83346520 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -199,6 +199,7 @@ impl Url { base_url: base_url, query_encoding_override: encoding_override, log_syntax_violation: log_syntax_violation, + context: parser::Context::UrlParser, }.parse_url(input) } diff --git a/src/parser.rs b/src/parser.rs index 3db1fd78..a4a2dafe 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -63,12 +63,6 @@ impl From<::idna::uts46::Errors> for ParseError { fn from(_: ::idna::uts46::Errors) -> ParseError { ParseError::IdnaError } } -#[derive(PartialEq, Eq)] -pub enum Context { - UrlParser, - Setter, -} - #[derive(Copy, Clone)] pub enum SchemeType { File, @@ -109,6 +103,13 @@ pub struct Parser<'a> { pub base_url: Option<&'a Url>, pub query_encoding_override: EncodingOverride, pub log_syntax_violation: Option<&'a Fn(&'static str)>, + pub context: Context, +} + +#[derive(PartialEq, Eq)] +pub enum Context { + UrlParser, + Setter, } impl<'a> Parser<'a> { @@ -133,7 +134,7 @@ impl<'a> Parser<'a> { if input.len() < original_input.len() { self.syntax_violation("leading or trailing control or space character") } - if let Ok(remaining) = self.parse_scheme(input, 
Context::UrlParser) { + if let Ok(remaining) = self.parse_scheme(input) { return self.parse_with_scheme(remaining) } @@ -156,7 +157,7 @@ impl<'a> Parser<'a> { } } - pub fn parse_scheme<'i>(&mut self, input: &'i str, context: Context) -> Result<&'i str, ()> { + pub fn parse_scheme<'i>(&mut self, input: &'i str) -> Result<&'i str, ()> { if input.is_empty() || !input.starts_with(ascii_alpha) { return Err(()) } @@ -174,7 +175,7 @@ impl<'a> Parser<'a> { } } // EOF before ':' - match context { + match self.context { Context::Setter => Ok(""), Context::UrlParser => { self.serialization.clear(); @@ -234,7 +235,7 @@ impl<'a> Parser<'a> { let remaining = if relative { let path_start = self.serialization.len(); self.serialization.push('/'); - self.parse_path(scheme_type, &mut false, path_start, &input[1..], Context::UrlParser) + self.parse_path(scheme_type, &mut false, path_start, &input[1..]) } else { self.parse_non_relative_path(input) }; @@ -356,13 +357,11 @@ impl<'a> Parser<'a> { let host_end = try!(to_u32(self.serialization.len())); let mut has_host = !matches!(host, HostInternal::None); let remaining = if path_start { - self.parse_path_start( - SchemeType::File, &mut has_host, remaining, Context::UrlParser) + self.parse_path_start(SchemeType::File, &mut has_host, remaining) } else { let path_start = self.serialization.len(); self.serialization.push('/'); - self.parse_path(SchemeType::File, &mut has_host, path_start, - remaining, Context::UrlParser) + self.parse_path(SchemeType::File, &mut has_host, path_start, remaining) }; // FIXME: deal with has_host let (query_start, fragment_start) = @@ -393,7 +392,7 @@ impl<'a> Parser<'a> { } } let remaining = self.parse_path( - SchemeType::File, &mut false, path_start, input, Context::UrlParser); + SchemeType::File, &mut false, path_start, input); let (query_start, fragment_start) = try!(self.parse_query_and_fragment(scheme_end, remaining)); let path_start = path_start as u32; @@ -425,8 +424,7 @@ impl<'a> Parser<'a> { 
self.serialization.push_str(before_query); self.pop_path(SchemeType::File, base_url.path_start as usize); let remaining = self.parse_path( - SchemeType::File, &mut true, base_url.path_start as usize, - input, Context::UrlParser); + SchemeType::File, &mut true, base_url.path_start as usize, input); let non_relative = false; self.with_query_and_fragment( non_relative, base_url.scheme_end, base_url.username_end, base_url.host_start, @@ -436,7 +434,7 @@ impl<'a> Parser<'a> { let scheme_end = "file".len() as u32; let path_start = "file://".len(); let remaining = self.parse_path( - SchemeType::File, &mut false, path_start, input, Context::UrlParser); + SchemeType::File, &mut false, path_start, input); let (query_start, fragment_start) = try!(self.parse_query_and_fragment(scheme_end, remaining)); let path_start = path_start as u32; @@ -507,7 +505,7 @@ impl<'a> Parser<'a> { debug_assert!(base_url.byte_at(path_start) == b'/'); self.serialization.push_str(base_url.slice(..path_start + 1)); let remaining = self.parse_path( - scheme_type, &mut true, path_start as usize, &input[1..], Context::UrlParser); + scheme_type, &mut true, path_start as usize, &input[1..]); let non_relative = false; self.with_query_and_fragment( non_relative, base_url.scheme_end, base_url.username_end, base_url.host_start, @@ -523,7 +521,7 @@ impl<'a> Parser<'a> { // FIXME spec says just "remove last entry", not the "pop" algorithm self.pop_path(scheme_type, base_url.path_start as usize); let remaining = self.parse_path( - scheme_type, &mut true, base_url.path_start as usize, input, Context::UrlParser); + scheme_type, &mut true, base_url.path_start as usize, input); let non_relative = false; self.with_query_and_fragment( non_relative, base_url.scheme_end, base_url.username_end, base_url.host_start, @@ -546,7 +544,7 @@ impl<'a> Parser<'a> { // path state let path_start = try!(to_u32(self.serialization.len())); let remaining = self.parse_path_start( - scheme_type, &mut true, remaining, Context::UrlParser); 
+ scheme_type, &mut true, remaining); self.with_query_and_fragment(non_relative, scheme_end, username_end, host_start, host_end, host, port, path_start, remaining) } @@ -732,7 +730,7 @@ impl<'a> Parser<'a> { } fn parse_path_start<'i>(&mut self, scheme_type: SchemeType, has_host: &mut bool, - mut input: &'i str, context: Context) + mut input: &'i str) -> &'i str { // Path start state let mut iter = input.chars(); @@ -746,11 +744,11 @@ impl<'a> Parser<'a> { } let path_start = self.serialization.len(); self.serialization.push('/'); - self.parse_path(scheme_type, has_host, path_start, input, context) + self.parse_path(scheme_type, has_host, path_start, input) } fn parse_path<'i>(&mut self, scheme_type: SchemeType, has_host: &mut bool, - path_start: usize, input: &'i str, context: Context) + path_start: usize, input: &'i str) -> &'i str { // Relative path state debug_assert!(self.serialization.ends_with("/")); @@ -773,7 +771,7 @@ impl<'a> Parser<'a> { end = i; break }, - '?' | '#' if context == Context::UrlParser => { + '?' 
| '#' if self.context == Context::UrlParser => { end = i; break }, @@ -893,7 +891,7 @@ impl<'a> Parser<'a> { Some('?') => { query_start = Some(try!(to_u32(self.serialization.len()))); self.serialization.push('?'); - let remaining = self.parse_query(scheme_end, &input[1..], Context::UrlParser); + let remaining = self.parse_query(scheme_end, &input[1..]); if let Some(remaining) = remaining { input = remaining } else { @@ -912,13 +910,13 @@ impl<'a> Parser<'a> { Ok((query_start, Some(fragment_start))) } - pub fn parse_query<'i>(&mut self, scheme_end: u32, input: &'i str, context: Context) + pub fn parse_query<'i>(&mut self, scheme_end: u32, input: &'i str) -> Option<&'i str> { let mut query = String::new(); // FIXME: use a streaming decoder instead let mut remaining = None; for (i, c) in input.char_indices() { match c { - '#' if context == Context::UrlParser => { + '#' if self.context == Context::UrlParser => { remaining = Some(&input[i..]); break }, From 31bde79556453f7f0afb2bd56870363200f97643 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 15 Feb 2016 15:05:53 +0100 Subject: [PATCH 22/89] Remove the redundant is_relative field. 
--- src/lib.rs | 17 +++++++---------- src/parser.rs | 30 +++++++++--------------------- src/slicing.rs | 7 +++---- 3 files changed, 19 insertions(+), 35 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 83346520..529a3a87 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -157,7 +157,6 @@ pub mod form_urlencoded; #[cfg_attr(feature="heap_size", derive(HeapSizeOf))] pub struct Url { serialization: String, - non_relative: bool, // Components scheme_end: u32, // Before ':' @@ -166,7 +165,7 @@ pub struct Url { host_end: u32, host: HostInternal, port: Option, - path_start: u32, // Before initial '/' if !non_relative + path_start: u32, // Before initial '/', if any query_start: Option, // Before '?', unlike Position::QueryStart fragment_start: Option, // Before '#', unlike Position::FragmentStart } @@ -208,7 +207,7 @@ impl Url { &self.serialization } - /// Return the scheme of this URL, as an ASCII string without the ':' delimiter. + /// Return the scheme of this URL, lower-cased, as an ASCII string without the ':' delimiter. #[inline] pub fn scheme(&self) -> &str { self.slice(..self.scheme_end) @@ -217,7 +216,7 @@ impl Url { /// Return whether this URL is non-relative (typical of e.g. `data:` and `mailto:` URLs.) #[inline] pub fn non_relative(&self) -> bool { - self.non_relative + self.byte_at(self.path_start) != b'/' } /// Return the username for this URL (typically the empty string) @@ -309,12 +308,11 @@ impl Url { /// /// Return `None` for non-relative URLs, or an iterator of at least one string. 
pub fn path_segments(&self) -> Option> { - if self.non_relative { - None - } else { - let path = self.path(); - debug_assert!(path.starts_with("/")); + let path = self.path(); + if path.starts_with('/') { Some(path[1..].split('/')) + } else { + None } } @@ -354,7 +352,6 @@ impl Url { try!(path_to_file_url_segments(path.as_ref(), &mut serialization)); Ok(Url { serialization: serialization, - non_relative: false, scheme_end: "file".len() as u32, username_end: path_start, host_start: path_start, diff --git a/src/parser.rs b/src/parser.rs index a4a2dafe..3ce11cfe 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -142,7 +142,7 @@ impl<'a> Parser<'a> { if let Some(base_url) = self.base_url { if input.starts_with("#") { self.fragment_only(base_url, input) - } else if base_url.non_relative { + } else if base_url.non_relative() { Err(ParseError::RelativeUrlWithNonRelativeBase) } else { let scheme_type = SchemeType::from(base_url.scheme()); @@ -204,7 +204,7 @@ impl<'a> Parser<'a> { if slashes_count < 2 && base_url.scheme() == &self.serialization[..scheme_end as usize] { // Non-relative URLs only happen with "not special" schemes. 
- debug_assert!(!base_url.non_relative); + debug_assert!(!base_url.non_relative()); self.serialization.clear(); return self.parse_relative(input, scheme_type, base_url) } @@ -231,15 +231,14 @@ impl<'a> Parser<'a> { let host_end = path_start; let host = HostInternal::None; let port = None; - let relative = input.starts_with("/"); - let remaining = if relative { + let remaining = if input.starts_with("/") { let path_start = self.serialization.len(); self.serialization.push('/'); self.parse_path(scheme_type, &mut false, path_start, &input[1..]) } else { self.parse_non_relative_path(input) }; - self.with_query_and_fragment(!relative, scheme_end, username_end, host_start, + self.with_query_and_fragment(scheme_end, username_end, host_start, host_end, host, port, path_start, remaining) } @@ -267,7 +266,6 @@ impl<'a> Parser<'a> { let path_start = "file://".len() as u32; Ok(Url { serialization: self.serialization, - non_relative: false, scheme_end: scheme_end, username_end: path_start, host_start: path_start, @@ -305,7 +303,6 @@ impl<'a> Parser<'a> { try!(self.parse_query_and_fragment(scheme_end, input)); Ok(Url { serialization: self.serialization, - non_relative: false, scheme_end: scheme_end, username_end: path_start, host_start: path_start, @@ -329,7 +326,6 @@ impl<'a> Parser<'a> { self.parse_fragment(&input[1..]); Ok(Url { serialization: self.serialization, - non_relative: false, scheme_end: scheme_end, username_end: path_start, host_start: path_start, @@ -368,7 +364,6 @@ impl<'a> Parser<'a> { try!(self.parse_query_and_fragment(scheme_end, remaining)); Ok(Url { serialization: self.serialization, - non_relative: false, scheme_end: scheme_end, username_end: host_start, host_start: host_start, @@ -398,7 +393,6 @@ impl<'a> Parser<'a> { let path_start = path_start as u32; Ok(Url { serialization: self.serialization, - non_relative: false, scheme_end: scheme_end, username_end: path_start, host_start: path_start, @@ -425,9 +419,8 @@ impl<'a> Parser<'a> { 
self.pop_path(SchemeType::File, base_url.path_start as usize); let remaining = self.parse_path( SchemeType::File, &mut true, base_url.path_start as usize, input); - let non_relative = false; self.with_query_and_fragment( - non_relative, base_url.scheme_end, base_url.username_end, base_url.host_start, + base_url.scheme_end, base_url.username_end, base_url.host_start, base_url.host_end, base_url.host, base_url.port, base_url.path_start, remaining) } else { self.serialization.push_str("file:///"); @@ -440,7 +433,6 @@ impl<'a> Parser<'a> { let path_start = path_start as u32; Ok(Url { serialization: self.serialization, - non_relative: false, scheme_end: scheme_end, username_end: path_start, host_start: path_start, @@ -506,9 +498,8 @@ impl<'a> Parser<'a> { self.serialization.push_str(base_url.slice(..path_start + 1)); let remaining = self.parse_path( scheme_type, &mut true, path_start as usize, &input[1..]); - let non_relative = false; self.with_query_and_fragment( - non_relative, base_url.scheme_end, base_url.username_end, base_url.host_start, + base_url.scheme_end, base_url.username_end, base_url.host_start, base_url.host_end, base_url.host, base_url.port, base_url.path_start, remaining) } _ => { @@ -522,9 +513,8 @@ impl<'a> Parser<'a> { self.pop_path(scheme_type, base_url.path_start as usize); let remaining = self.parse_path( scheme_type, &mut true, base_url.path_start as usize, input); - let non_relative = false; self.with_query_and_fragment( - non_relative, base_url.scheme_end, base_url.username_end, base_url.host_start, + base_url.scheme_end, base_url.username_end, base_url.host_start, base_url.host_end, base_url.host, base_url.port, base_url.path_start, remaining) } } @@ -534,7 +524,6 @@ impl<'a> Parser<'a> { -> ParseResult { self.serialization.push('/'); self.serialization.push('/'); - let non_relative = false; // authority state let (username_end, remaining) = try!(self.parse_userinfo(input, scheme_type)); // host state @@ -545,7 +534,7 @@ impl<'a> Parser<'a> { 
let path_start = try!(to_u32(self.serialization.len())); let remaining = self.parse_path_start( scheme_type, &mut true, remaining); - self.with_query_and_fragment(non_relative, scheme_end, username_end, host_start, + self.with_query_and_fragment(scheme_end, username_end, host_start, host_end, host, port, path_start, remaining) } @@ -861,7 +850,7 @@ impl<'a> Parser<'a> { "" } - fn with_query_and_fragment(mut self, non_relative: bool, scheme_end: u32, username_end: u32, + fn with_query_and_fragment(mut self, scheme_end: u32, username_end: u32, host_start: u32, host_end: u32, host: HostInternal, port: Option, path_start: u32, remaining: &str) -> ParseResult { @@ -869,7 +858,6 @@ impl<'a> Parser<'a> { try!(self.parse_query_and_fragment(scheme_end, remaining)); Ok(Url { serialization: self.serialization, - non_relative: non_relative, scheme_end: scheme_end, username_end: username_end, host_start: host_start, diff --git a/src/slicing.rs b/src/slicing.rs index 4dc8e852..665cb6c3 100644 --- a/src/slicing.rs +++ b/src/slicing.rs @@ -104,13 +104,12 @@ impl Url { Position::AfterScheme => self.scheme_end as usize, - Position::BeforeUsername => if self.non_relative { + Position::BeforeUsername => if self.slice(self.scheme_end..).starts_with("://") { + self.scheme_end as usize + "://".len() + } else { debug_assert!(self.byte_at(self.scheme_end) == b':'); debug_assert!(self.scheme_end + ":".len() as u32 == self.username_end); self.scheme_end as usize + ":".len() - } else { - debug_assert!(self.slice(self.scheme_end..).starts_with("://")); - self.scheme_end as usize + "://".len() }, Position::AfterUsername => self.username_end as usize, From 5364f2b19fb06584d7d0df80fc35e79e705e7bea Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 15 Feb 2016 16:05:28 +0100 Subject: [PATCH 23/89] Add Url::domain and Url::ip_address --- Cargo.toml | 1 + build.rs | 17 +++++++++++++++++ src/lib.rs | 26 ++++++++++++++++++++++++++ 3 files changed, 44 insertions(+) create mode 100644 build.rs diff 
--git a/Cargo.toml b/Cargo.toml index 31804048..1848b2b6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ repository = "https://github.com/servo/rust-url" readme = "README.md" keywords = ["url", "parser"] license = "MIT/Apache-2.0" +build = "build.rs" [[test]] name = "form_urlencoded" diff --git a/build.rs b/build.rs new file mode 100644 index 00000000..0d005ee7 --- /dev/null +++ b/build.rs @@ -0,0 +1,17 @@ +use std::process::{Command, Stdio}; +use std::io::Write; + +fn main() { + let mut child = Command::new(option_env!("RUSTC").unwrap_or("rustc")) + .args(&["-", "--crate-type", "lib", "-Z", "no-trans"]) + .stdin(Stdio::piped()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .unwrap(); + child.stdin.as_mut().unwrap().write_all(b"use std::net::IpAddr;").unwrap(); + if child.wait().unwrap().success() { + // We can use `IpAddr` as it is `#[stable]` in this version of Rust. + println!("cargo:rustc-cfg=has_ipaddr") + } +} diff --git a/src/lib.rs b/src/lib.rs index 529a3a87..dbea2d68 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -130,6 +130,7 @@ use percent_encoding::{PATH_SEGMENT_ENCODE_SET, percent_encode, percent_decode}; use std::cmp; use std::fmt; use std::hash; +#[cfg(has_ipaddr)] use std::net::IpAddr; use std::ops::{Range, RangeFrom, RangeTo}; use std::path::{Path, PathBuf}; use std::str; @@ -241,7 +242,9 @@ impl Url { } /// Return the string representation of the host (domain or IP address) for this URL, if any. + /// /// Non-ASCII domains are punycode-encoded per IDNA. + /// IPv6 addresses are given between `[` and `]` brackets. /// /// Non-relative URLs (typical of `data:` and `mailto:`) and some `file:` URLs /// don’t have a host. @@ -271,6 +274,29 @@ impl Url { } } + /// If this URL has a host and it is a domain name (not an IP address), return it. 
+ pub fn domain(&self) -> Option<&str> { + match self.host { + HostInternal::None => None, + HostInternal::Domain => Some(self.slice(self.host_start..self.host_end)), + HostInternal::Ipv4(_) => None, + HostInternal::Ipv6(_) => None, + } + } + + /// If this URL has a host and it is an IP address (not a domain name), return it. + /// + /// This does **not** resolve domain names. + #[cfg(has_ipaddr)] + pub fn ip_address(&self) -> Option { + match self.host { + HostInternal::None => None, + HostInternal::Domain => None, + HostInternal::Ipv4(address) => Some(IpAddr::V4(address)), + HostInternal::Ipv6(address) => Some(IpAddr::V6(address)), + } + } + /// Return the port number for this URL, if any. #[inline] pub fn port(&self) -> Option { From 1afe54fa41f29b98dba3757b7af496b403f53512 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 15 Feb 2016 23:35:25 +0100 Subject: [PATCH 24/89] Implement ToSocketAddrs --- src/host.rs | 65 +++++++++++++++++++++++++++++++++++++++++++++++- src/lib.rs | 68 ++++++++++++++++++++++++++++++++++++++++++++++----- src/origin.rs | 2 +- 3 files changed, 127 insertions(+), 8 deletions(-) diff --git a/src/host.rs b/src/host.rs index 2124b1ab..f147c208 100644 --- a/src/host.rs +++ b/src/host.rs @@ -8,7 +8,9 @@ use std::cmp; use std::fmt::{self, Formatter, Write}; -use std::net::{Ipv4Addr, Ipv6Addr}; +use std::io; +use std::net::{Ipv4Addr, Ipv6Addr, SocketAddr, SocketAddrV4, SocketAddrV6, ToSocketAddrs}; +use std::vec; use parser::{ParseResult, ParseError}; use percent_encoding::lossy_utf8_percent_decode; use idna; @@ -44,6 +46,7 @@ pub enum Host { } impl<'a> Host<&'a str> { + /// Return a copy of `self` that owns an allocated `String` but does not borrow an `&Url`. pub fn to_owned(&self) -> Host { match *self { Host::Domain(domain) => Host::Domain(domain.to_owned()), @@ -93,6 +96,66 @@ impl> fmt::Display for Host { } } +/// This mostly exists because coherence rules don’t allow us to implement +/// `ToSocketAddrs for (Host, u16)`. 
+pub struct HostAndPort { + pub host: Host, + pub port: u16, +} + +impl<'a> HostAndPort<&'a str> { + /// Return a copy of `self` that owns an allocated `String` but does not borrow an `&Url`. + pub fn to_owned(&self) -> HostAndPort { + HostAndPort { + host: self.host.to_owned(), + port: self.port + } + } +} + +impl> ToSocketAddrs for HostAndPort { + type Iter = SocketAddrs; + + fn to_socket_addrs(&self) -> io::Result { + let port = self.port; + match self.host { + Host::Domain(ref domain) => Ok(SocketAddrs { + state: SocketAddrsState::Domain(try!((domain.as_ref(), port).to_socket_addrs())) + }), + Host::Ipv4(address) => Ok(SocketAddrs { + state: SocketAddrsState::One(SocketAddr::V4(SocketAddrV4::new(address, port))) + }), + Host::Ipv6(address) => Ok(SocketAddrs { + state: SocketAddrsState::One(SocketAddr::V6(SocketAddrV6::new(address, port, 0, 0))) + }), + } + } +} + +pub struct SocketAddrs { + state: SocketAddrsState +} + +enum SocketAddrsState { + Domain(vec::IntoIter), + One(SocketAddr), + Done, +} + +impl Iterator for SocketAddrs { + type Item = SocketAddr; + fn next(&mut self) -> Option { + match self.state { + SocketAddrsState::Domain(ref mut iter) => iter.next(), + SocketAddrsState::One(s) => { + self.state = SocketAddrsState::Done; + Some(s) + } + SocketAddrsState::Done => None + } + } +} + /// Parse `input` as a host. /// If successful, write its serialization to `serialization` /// and return the internal representation for `Url`. 
diff --git a/src/lib.rs b/src/lib.rs index dbea2d68..d0d1d306 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -130,14 +130,15 @@ use percent_encoding::{PATH_SEGMENT_ENCODE_SET, percent_encode, percent_decode}; use std::cmp; use std::fmt; use std::hash; -#[cfg(has_ipaddr)] use std::net::IpAddr; +use std::io; +use std::net::ToSocketAddrs; use std::ops::{Range, RangeFrom, RangeTo}; use std::path::{Path, PathBuf}; use std::str; pub use encoding::EncodingOverride; pub use origin::Origin; -pub use host::Host; +pub use host::{Host, HostAndPort, SocketAddrs}; pub use parser::ParseError; pub use slicing::Position; pub use webidl::WebIdl; @@ -288,12 +289,12 @@ impl Url { /// /// This does **not** resolve domain names. #[cfg(has_ipaddr)] - pub fn ip_address(&self) -> Option { + pub fn ip_address(&self) -> Option { match self.host { HostInternal::None => None, HostInternal::Domain => None, - HostInternal::Ipv4(address) => Some(IpAddr::V4(address)), - HostInternal::Ipv6(address) => Some(IpAddr::V6(address)), + HostInternal::Ipv4(address) => Some(net::IpAddr::V4(address)), + HostInternal::Ipv6(address) => Some(net::IpAddr::V6(address)), } } @@ -311,10 +312,52 @@ impl Url { /// For URLs in these schemes, this method always returns `Some(_)`. /// For other schemes, it is the same as `Url::port()`. #[inline] - pub fn port_or_default(&self) -> Option { + pub fn port_or_known_default(&self) -> Option { self.port.or_else(|| parser::default_port(self.scheme())) } + /// If the URL has a host, return something that implements `ToSocketAddrs`. + /// + /// If the URL has no port number and the scheme’s default port number is not known + /// (see `Url::port_or_known_default`), + /// the closure is called to obtain a port number. + /// Typically, this closure can match on the result `Url::scheme` + /// to have per-scheme default port numbers, + /// and panic for schemes it’s not prepared to handle. 
+ /// For example: + /// + /// ```rust + /// # use url::Url; + /// # use std::net::TcpStream; + /// # use std::io; + /// + /// fn connect(url: &Url) -> io::Result { + /// TcpStream::connect(try!(url.with_default_port(default_port))) + /// } + /// + /// fn default_port(url: &Url) -> Result { + /// match url.scheme() { + /// "git" => Ok(9418), + /// "git+ssh" => Ok(22), + /// "git+https" => Ok(443), + /// "git+http" => Ok(80), + /// _ => Err(()), + /// } + /// } + /// ``` + pub fn with_default_port(&self, f: F) -> io::Result> + where F: FnOnce(&Url) -> Result { + Ok(HostAndPort { + host: try!(self.host() + .ok_or(()) + .or_else(|()| io_error("URL has no host"))), + port: try!(self.port_or_known_default() + .ok_or(()) + .or_else(|()| f(self)) + .or_else(|()| io_error("URL has no port number"))) + }) + } + /// Return the path for this URL, as a percent-encoded ASCII string. /// For relative URLs, this starts with a '/' slash /// and continues with slash-separated path segments. @@ -463,6 +506,15 @@ impl Url { } } +/// Return an error if `Url::host` or `Url::port_or_known_default` return `None`. +impl ToSocketAddrs for Url { + type Iter = SocketAddrs; + + fn to_socket_addrs(&self) -> io::Result { + try!(self.with_default_port(|_| Err(()))).to_socket_addrs() + } +} + /// Parse a string as an URL, without a base URL or encoding override. 
impl str::FromStr for Url { type Err = ParseError; @@ -695,3 +747,7 @@ fn file_url_segments_to_pathbuf_windows(mut segments: str::Split) -> Resul "to_file_path() failed to produce an absolute Path"); Ok(path) } + +fn io_error(reason: &str) -> io::Result { + Err(io::Error::new(io::ErrorKind::InvalidData, reason)) +} diff --git a/src/origin.rs b/src/origin.rs index 10011c42..ffe2e98b 100644 --- a/src/origin.rs +++ b/src/origin.rs @@ -23,7 +23,7 @@ impl Url { }, "ftp" | "gopher" | "http" | "https" | "ws" | "wss" => { Origin::Tuple(scheme.to_owned(), self.host().unwrap().to_owned(), - self.port_or_default().unwrap()) + self.port_or_known_default().unwrap()) }, // TODO: Figure out what to do if the scheme is a file "file" => Origin::new_opaque(), From 21db81f0fbe5dacb373483bb489b70a77856779c Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 15 Feb 2016 23:53:02 +0100 Subject: [PATCH 25/89] Remove Url::ip_address for now Reconsider when IpAddr is stable. --- Cargo.toml | 1 - build.rs | 17 ----------------- src/host.rs | 1 + src/lib.rs | 18 +----------------- 4 files changed, 2 insertions(+), 35 deletions(-) delete mode 100644 build.rs diff --git a/Cargo.toml b/Cargo.toml index 1848b2b6..31804048 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,6 @@ repository = "https://github.com/servo/rust-url" readme = "README.md" keywords = ["url", "parser"] license = "MIT/Apache-2.0" -build = "build.rs" [[test]] name = "form_urlencoded" diff --git a/build.rs b/build.rs deleted file mode 100644 index 0d005ee7..00000000 --- a/build.rs +++ /dev/null @@ -1,17 +0,0 @@ -use std::process::{Command, Stdio}; -use std::io::Write; - -fn main() { - let mut child = Command::new(option_env!("RUSTC").unwrap_or("rustc")) - .args(&["-", "--crate-type", "lib", "-Z", "no-trans"]) - .stdin(Stdio::piped()) - .stdout(Stdio::null()) - .stderr(Stdio::null()) - .spawn() - .unwrap(); - child.stdin.as_mut().unwrap().write_all(b"use std::net::IpAddr;").unwrap(); - if child.wait().unwrap().success() { 
- // We can use `IpAddr` as it is `#[stable]` in this version of Rust. - println!("cargo:rustc-cfg=has_ipaddr") - } -} diff --git a/src/host.rs b/src/host.rs index f147c208..5eb01a63 100644 --- a/src/host.rs +++ b/src/host.rs @@ -120,6 +120,7 @@ impl> ToSocketAddrs for HostAndPort { let port = self.port; match self.host { Host::Domain(ref domain) => Ok(SocketAddrs { + // FIXME: use std::net::lookup_host when it’s stable. state: SocketAddrsState::Domain(try!((domain.as_ref(), port).to_socket_addrs())) }), Host::Ipv4(address) => Ok(SocketAddrs { diff --git a/src/lib.rs b/src/lib.rs index d0d1d306..4e446704 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -145,7 +145,6 @@ pub use webidl::WebIdl; mod encoding; mod host; -mod idna_mapping; mod origin; mod parser; mod slicing; @@ -278,23 +277,8 @@ impl Url { /// If this URL has a host and it is a domain name (not an IP address), return it. pub fn domain(&self) -> Option<&str> { match self.host { - HostInternal::None => None, HostInternal::Domain => Some(self.slice(self.host_start..self.host_end)), - HostInternal::Ipv4(_) => None, - HostInternal::Ipv6(_) => None, - } - } - - /// If this URL has a host and it is an IP address (not a domain name), return it. - /// - /// This does **not** resolve domain names. 
- #[cfg(has_ipaddr)] - pub fn ip_address(&self) -> Option { - match self.host { - HostInternal::None => None, - HostInternal::Domain => None, - HostInternal::Ipv4(address) => Some(net::IpAddr::V4(address)), - HostInternal::Ipv6(address) => Some(net::IpAddr::V6(address)), + _ => None, } } From 1a22a90ae5bc8e5e1a32dca6fc2c0d8db932a842 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 16 Feb 2016 17:08:52 +0100 Subject: [PATCH 26/89] Add Unicode and ASCII serializations of origins --- src/origin.rs | 42 ++++++++++++++++++++++++++++++++++++++++-- src/webidl.rs | 6 +++--- 2 files changed, 43 insertions(+), 5 deletions(-) diff --git a/src/origin.rs b/src/origin.rs index ffe2e98b..0d00680f 100644 --- a/src/origin.rs +++ b/src/origin.rs @@ -6,8 +6,11 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use Url; use host::Host; +use idna::domain_to_unicode; +use parser::default_port; +use std::sync::Arc; +use Url; impl Url { /// Return the origin of this URL (https://url.spec.whatwg.org/#origin) @@ -46,7 +49,42 @@ pub enum Origin { impl Origin { /// Creates a new opaque origin that is only equal to itself. 
pub fn new_opaque() -> Origin { - Origin::Opaque(OpaqueOrigin(Box::new(0))) + Origin::Opaque(OpaqueOrigin(Arc::new(0))) + } + + /// https://html.spec.whatwg.org/multipage/#ascii-serialisation-of-an-origin + pub fn ascii_serialization(&self) -> String { + match *self { + Origin::Opaque(_) => "null".to_owned(), + Origin::Tuple(ref scheme, ref host, port) => { + if default_port(scheme) == Some(port) { + format!("{}://{}", scheme, host) + } else { + format!("{}://{}:{}", scheme, host, port) + } + } + } + } + + /// https://html.spec.whatwg.org/multipage/#unicode-serialisation-of-an-origin + pub fn unicode_serialization(&self) -> String { + match *self { + Origin::Opaque(_) => "null".to_owned(), + Origin::Tuple(ref scheme, ref host, port) => { + let host = match *host { + Host::Domain(ref domain) => { + let (domain, _errors) = domain_to_unicode(domain); + Host::Domain(domain) + } + _ => host.clone() + }; + if default_port(scheme) == Some(port) { + format!("{}://{}", scheme, host) + } else { + format!("{}://{}:{}", scheme, host, port) + } + } + } } } diff --git a/src/webidl.rs b/src/webidl.rs index 9361538d..035748c6 100644 --- a/src/webidl.rs +++ b/src/webidl.rs @@ -31,9 +31,9 @@ impl WebIdl { Ok(()) } - /// **Not implemented yet** Getter for https://url.spec.whatwg.org/#dom-url-origin - pub fn origin(_url: &Url) -> String { - unimplemented!() // FIXME + /// Getter for https://url.spec.whatwg.org/#dom-url-origin + pub fn origin(url: &Url) -> String { + url.origin().unicode_serialization() } /// Getter for https://url.spec.whatwg.org/#dom-url-protocol From 8eba0ce1060b196fdd87d371ddd514cfcb47dff2 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 16 Feb 2016 17:09:11 +0100 Subject: [PATCH 27/89] Test WebIdl::origin CC https://github.com/w3c/web-platform-tests/pull/2584 --- tests/urltestdata.json | 24 ------------------------ tests/wpt.rs | 13 ++++++++----- 2 files changed, 8 insertions(+), 29 deletions(-) diff --git a/tests/urltestdata.json b/tests/urltestdata.json 
index 4ea27d73..2c7d344f 100644 --- a/tests/urltestdata.json +++ b/tests/urltestdata.json @@ -919,7 +919,6 @@ "input": "file:/example.com/", "base": "http://example.org/foo/bar", "href": "file:///example.com/", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -1295,7 +1294,6 @@ "input": "file:c:\\foo\\bar.html", "base": "file:///tmp/mock/path", "href": "file:///c:/foo/bar.html", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -1310,7 +1308,6 @@ "input": " File:c|////foo\\bar.html", "base": "file:///tmp/mock/path", "href": "file:///c:////foo/bar.html", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -1325,7 +1322,6 @@ "input": "C|/foo/bar", "base": "file:///tmp/mock/path", "href": "file:///C:/foo/bar", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -1340,7 +1336,6 @@ "input": "/C|\\foo\\bar", "base": "file:///tmp/mock/path", "href": "file:///C:/foo/bar", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -1355,7 +1350,6 @@ "input": "//C|/foo/bar", "base": "file:///tmp/mock/path", "href": "file:///C:/foo/bar", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -1370,7 +1364,6 @@ "input": "//server/file", "base": "file:///tmp/mock/path", "href": "file://server/file", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -1385,7 +1378,6 @@ "input": "\\\\server\\file", "base": "file:///tmp/mock/path", "href": "file://server/file", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -1400,7 +1392,6 @@ "input": "/\\server/file", "base": "file:///tmp/mock/path", "href": "file://server/file", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -1415,7 +1406,6 @@ "input": "file:///foo/bar.txt", "base": "file:///tmp/mock/path", "href": "file:///foo/bar.txt", - "origin": "file://", "protocol": "file:", "username": "", 
"password": "", @@ -1430,7 +1420,6 @@ "input": "file:///home/me", "base": "file:///tmp/mock/path", "href": "file:///home/me", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -1445,7 +1434,6 @@ "input": "//", "base": "file:///tmp/mock/path", "href": "file:///", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -1460,7 +1448,6 @@ "input": "///", "base": "file:///tmp/mock/path", "href": "file:///", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -1475,7 +1462,6 @@ "input": "///test", "base": "file:///tmp/mock/path", "href": "file:///test", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -1490,7 +1476,6 @@ "input": "file://test", "base": "file:///tmp/mock/path", "href": "file://test/", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -1505,7 +1490,6 @@ "input": "file://localhost", "base": "file:///tmp/mock/path", "href": "file:///", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -1520,7 +1504,6 @@ "input": "file://localhost/", "base": "file:///tmp/mock/path", "href": "file:///", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -1535,7 +1518,6 @@ "input": "file://localhost/test", "base": "file:///tmp/mock/path", "href": "file:///test", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -1550,7 +1532,6 @@ "input": "test", "base": "file:///tmp/mock/path", "href": "file:///tmp/mock/test", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -1565,7 +1546,6 @@ "input": "file:test", "base": "file:///tmp/mock/path", "href": "file:///tmp/mock/test", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -2587,7 +2567,6 @@ "input": "file:/example.com/", "base": "about:blank", "href": "file:///example.com/", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ 
-3344,7 +3323,6 @@ "input": "file:...", "base": "http://www.example.com/test", "href": "file:///...", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -3359,7 +3337,6 @@ "input": "file:..", "base": "http://www.example.com/test", "href": "file:///", - "origin": "file://", "protocol": "file:", "username": "", "password": "", @@ -3374,7 +3351,6 @@ "input": "file:a", "base": "http://www.example.com/test", "href": "file:///a", - "origin": "file://", "protocol": "file:", "username": "", "password": "", diff --git a/tests/wpt.rs b/tests/wpt.rs index 18e43aca..b7254938 100644 --- a/tests/wpt.rs +++ b/tests/wpt.rs @@ -29,17 +29,20 @@ fn run_one(input: String, base: String, expected: Result) { }; macro_rules! assert_getter { - ($attribute: ident) => { + ($attribute: ident) => { assert_getter!($attribute, expected.$attribute) }; + ($attribute: ident, $expected: expr) => { { let a = WebIdl::$attribute(&url); - let b = expected.$attribute; + let b = $expected; assert!(a == b, "{:?} != {:?} for URL {:?}", a, b, url); } } } assert_getter!(href); - //assert_getter!(origin); FIXME + if let Some(expected_origin) = expected.origin { + assert_getter!(origin, expected_origin); + } assert_getter!(protocol); assert_getter!(username); assert_getter!(password); @@ -53,7 +56,7 @@ fn run_one(input: String, base: String, expected: Result) { struct TestCase { href: String, - origin: String, + origin: Option, protocol: String, username: String, password: String, @@ -80,7 +83,7 @@ fn main() { } else { Ok(TestCase { href: string("href"), - origin: string("origin"), + origin: entry.find("origin").map(|j| j.as_string().unwrap().to_owned()), protocol: string("protocol"), username: string("username"), password: string("password"), From 44b601b453912cf16ce1d0792adeccd0cebf78d6 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Thu, 11 Feb 2016 18:33:07 +0100 Subject: [PATCH 28/89] Add a fragment setter --- src/lib.rs | 36 +++++++++++++++++++++++++++++++----- src/parser.rs | 
2 +- src/webidl.rs | 12 +++++++++--- 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 4e446704..a43093cd 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -126,11 +126,13 @@ assert_eq!(css_url.as_str(), "http://servo.github.io/rust-url/main.css") extern crate idna; use host::HostInternal; +use parser::{Parser, Context}; use percent_encoding::{PATH_SEGMENT_ENCODE_SET, percent_encode, percent_decode}; use std::cmp; use std::fmt; use std::hash; use std::io; +use std::mem; use std::net::ToSocketAddrs; use std::ops::{Range, RangeFrom, RangeTo}; use std::path::{Path, PathBuf}; @@ -139,7 +141,7 @@ use std::str; pub use encoding::EncodingOverride; pub use origin::Origin; pub use host::{Host, HostAndPort, SocketAddrs}; -pub use parser::ParseError; +pub use parser::{ParseError, to_u32}; pub use slicing::Position; pub use webidl::WebIdl; @@ -194,12 +196,12 @@ impl Url { encoding_override: EncodingOverride, log_syntax_violation: Option<&Fn(&'static str)>) -> Result { - parser::Parser { + Parser { serialization: String::with_capacity(input.len()), base_url: base_url, query_encoding_override: encoding_override, log_syntax_violation: log_syntax_violation, - context: parser::Context::UrlParser, + context: Context::UrlParser, }.parse_url(input) } @@ -386,8 +388,8 @@ impl Url { /// Return this URL’s fragment identifier, if any. /// - /// **Note:** the parser does *not* percent-encode this component, - /// but the input may be percent-encoded already. + /// **Note:** the parser did *not* percent-encode this component, + /// but the input may have been percent-encoded already. 
pub fn fragment(&self) -> Option<&str> { self.fragment_start.map(|start| { debug_assert!(self.byte_at(start) == b'#'); @@ -395,6 +397,30 @@ impl Url { }) } + fn mutate(&mut self, f: F) { + let mut parser = Parser { + serialization: mem::replace(&mut self.serialization, String::new()), + base_url: None, + query_encoding_override: EncodingOverride::utf8(), + log_syntax_violation: None, + context: Context::Setter, + }; + f(&mut parser); + self.serialization = parser.serialization; + } + + /// Change this URL’s fragment identifier. + pub fn set_fragment(&mut self, fragment: Option<&str>) { + if let Some(start) = self.fragment_start { + debug_assert!(self.byte_at(start) == b'#'); + self.serialization.truncate(start as usize); + } + if let Some(input) = fragment { + self.serialization.push('#'); + self.mutate(|parser| parser.parse_fragment(input)); + } + } + /// Convert a file name as `std::path::Path` into an URL in the `file` scheme. /// /// This returns `Err` if the given path is not absolute or, diff --git a/src/parser.rs b/src/parser.rs index 3ce11cfe..b2be2280 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1053,7 +1053,7 @@ pub fn ascii_alpha(ch: char) -> bool { } #[inline] -fn to_u32(i: usize) -> ParseResult { +pub fn to_u32(i: usize) -> ParseResult { if i <= ::std::u32::MAX as usize { Ok(i as u32) } else { diff --git a/src/webidl.rs b/src/webidl.rs index 035748c6..2a129be6 100644 --- a/src/webidl.rs +++ b/src/webidl.rs @@ -161,8 +161,14 @@ impl WebIdl { } } - /// **Not implemented yet** Setter for https://url.spec.whatwg.org/#dom-url-hash - pub fn set_hash(_url: &mut Url, _new_hash: &str) { - unimplemented!() // FIXME + /// Setter for https://url.spec.whatwg.org/#dom-url-hash + pub fn set_hash(url: &mut Url, new_hash: &str) { + if url.scheme() != "javascript" { + url.set_fragment(match new_hash { + "" => None, + _ if new_hash.starts_with('#') => Some(&new_hash[1..]), + _ => Some(new_hash), + }) + } } } From 0ac1ac59bb41767e543c183c4c902b4ca13087ff Mon Sep 17 
00:00:00 2001 From: Simon Sapin Date: Fri, 12 Feb 2016 18:29:28 +0100 Subject: [PATCH 29/89] Add a query setter. --- src/lib.rs | 40 +++++++++++++++++++++++++++++++++++++--- src/webidl.rs | 10 +++++++--- 2 files changed, 44 insertions(+), 6 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index a43093cd..2dd7f6b9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -397,7 +397,7 @@ impl Url { }) } - fn mutate(&mut self, f: F) { + fn mutate R, R>(&mut self, f: F) -> R { let mut parser = Parser { serialization: mem::replace(&mut self.serialization, String::new()), base_url: None, @@ -405,19 +405,53 @@ impl Url { log_syntax_violation: None, context: Context::Setter, }; - f(&mut parser); + let result = f(&mut parser); self.serialization = parser.serialization; + result } /// Change this URL’s fragment identifier. pub fn set_fragment(&mut self, fragment: Option<&str>) { + // Remove any previous fragment if let Some(start) = self.fragment_start { debug_assert!(self.byte_at(start) == b'#'); self.serialization.truncate(start as usize); } + // Write the new one if let Some(input) = fragment { + self.fragment_start = Some(to_u32(self.serialization.len()).unwrap()); self.serialization.push('#'); - self.mutate(|parser| parser.parse_fragment(input)); + self.mutate(|parser| parser.parse_fragment(input)) + } else { + self.fragment_start = None + } + } + + /// Change this URL’s query string. 
+ pub fn set_query(&mut self, query: Option<&str>) { + // Stash any fragment + let fragment = self.fragment_start.map(|start| { + let f = self.slice(start..).to_owned(); + self.serialization.truncate(start as usize); + f + }); + // Remove any previous query + if let Some(start) = self.query_start { + debug_assert!(self.byte_at(start) == b'?'); + self.serialization.truncate(start as usize); + } + // Write the new one + if let Some(input) = query { + self.query_start = Some(to_u32(self.serialization.len()).unwrap()); + self.serialization.push('?'); + let scheme_end = self.scheme_end; + self.mutate(|parser| parser.parse_query(scheme_end, input)); + } + // Restore the fragment, if any + if let Some(ref fragment) = fragment { + self.fragment_start = Some(to_u32(self.serialization.len()).unwrap()); + debug_assert!(fragment.starts_with('#')); + self.serialization.push_str(fragment) // It’s already been through the parser } } diff --git a/src/webidl.rs b/src/webidl.rs index 2a129be6..2027dd60 100644 --- a/src/webidl.rs +++ b/src/webidl.rs @@ -139,9 +139,13 @@ impl WebIdl { } } - /// **Not implemented yet** Setter for https://url.spec.whatwg.org/#dom-url-search - pub fn set_search(_url: &mut Url, _new_search: &str) { - unimplemented!() // FIXME + /// Setter for https://url.spec.whatwg.org/#dom-url-search + pub fn set_search(url: &mut Url, new_search: &str) { + url.set_query(match new_search { + "" => None, + _ if new_search.starts_with('?') => Some(&new_search[1..]), + _ => Some(new_search), + }) } /// **Not implemented yet** Getter for https://url.spec.whatwg.org/#dom-url-searchparams From ec1e55df3f3dacfcb13c0472d9b6193ed0b60210 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 19 Feb 2016 16:51:09 +0100 Subject: [PATCH 30/89] Make Url::parse_with usable. (EncodingOverride is private.) 
--- src/encoding.rs | 9 +++++++++ src/lib.rs | 23 +++++++++++++---------- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/src/encoding.rs b/src/encoding.rs index be53ea19..1679a55a 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -27,6 +27,10 @@ pub struct EncodingOverride { #[cfg(feature = "query_encoding")] impl EncodingOverride { + pub fn from_parse_options(options: &::ParseOptions) -> EncodingOverride { + EncodingOverride::from_opt_encoding(options.encoding_override) + } + pub fn from_opt_encoding(encoding: Option) -> EncodingOverride { encoding.map(EncodingOverride::from_encoding).unwrap_or_else(EncodingOverride::utf8) } @@ -76,6 +80,11 @@ pub struct EncodingOverride; #[cfg(not(feature = "query_encoding"))] impl EncodingOverride { + #[inline] + pub fn from_parse_options(_options: &::ParseOptions) -> EncodingOverride { + EncodingOverride + } + #[inline] pub fn utf8() -> EncodingOverride { EncodingOverride diff --git a/src/lib.rs b/src/lib.rs index 2dd7f6b9..8d084706 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -173,17 +173,24 @@ pub struct Url { fragment_start: Option, // Before '#', unlike Position::FragmentStart } +#[derive(Default)] +pub struct ParseOptions<'a> { + pub base_url: Option<&'a Url>, + #[cfg(feature = "query_encoding")] pub encoding_override: Option, + pub log_syntax_violation: Option<&'a Fn(&'static str)>, +} + impl Url { /// Parse an absolute URL from a string. #[inline] pub fn parse(input: &str) -> Result { - Url::parse_with(input, None, EncodingOverride::utf8(), None) + Url::parse_with(input, ParseOptions::default()) } /// Parse a string as an URL, with this URL as the base URL. #[inline] pub fn join(&self, input: &str) -> Result { - Url::parse_with(input, Some(self), EncodingOverride::utf8(), None) + Url::parse_with(input, ParseOptions { base_url: Some(self), ..Default::default() }) } /// The URL parser with all of its parameters. 
@@ -191,16 +198,12 @@ impl Url { /// `encoding_override` is a legacy concept only relevant for HTML. /// When it’s not needed, /// `s.parse::()`, `Url::from_str(s)` and `url.join(s)` can be used instead. - pub fn parse_with(input: &str, - base_url: Option<&Url>, - encoding_override: EncodingOverride, - log_syntax_violation: Option<&Fn(&'static str)>) - -> Result { + pub fn parse_with(input: &str, options: ParseOptions) -> Result { Parser { serialization: String::with_capacity(input.len()), - base_url: base_url, - query_encoding_override: encoding_override, - log_syntax_violation: log_syntax_violation, + base_url: options.base_url, + query_encoding_override: EncodingOverride::from_parse_options(&options), + log_syntax_violation: options.log_syntax_violation, context: Context::UrlParser, }.parse_url(input) } From 0e5e27c911dbec237bf5d6ced025ae8b78fda411 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 19 Feb 2016 16:55:30 +0100 Subject: [PATCH 31/89] Add Origin::is_tuple --- src/origin.rs | 4 ++++ src/slicing.rs | 1 + 2 files changed, 5 insertions(+) diff --git a/src/origin.rs b/src/origin.rs index 0d00680f..f686768c 100644 --- a/src/origin.rs +++ b/src/origin.rs @@ -52,6 +52,10 @@ impl Origin { Origin::Opaque(OpaqueOrigin(Arc::new(0))) } + pub fn is_tuple(&self) -> bool { + matches!(*self, Origin::Tuple(..)) + } + /// https://html.spec.whatwg.org/multipage/#ascii-serialisation-of-an-origin pub fn ascii_serialization(&self) -> String { match *self { diff --git a/src/slicing.rs b/src/slicing.rs index 665cb6c3..502d11ff 100644 --- a/src/slicing.rs +++ b/src/slicing.rs @@ -77,6 +77,7 @@ impl Index> for Url { /// `BeforeScheme` and `AfterFragment` are always the start and end of the entire URL, /// so `&url[BeforeScheme..X]` is the same as `&url[..X]` /// and `&url[X..AfterFragment]` is the same as `&url[X..]`. 
+#[derive(Copy, Clone, Debug)] pub enum Position { BeforeScheme, AfterScheme, From 1813290382a1238079c1fb7479f051849a6d8cd0 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 19 Feb 2016 21:33:10 +0100 Subject: [PATCH 32/89] More consistent checks for URL with authority or path-only. --- src/lib.rs | 29 ++++++++++++++++++++++++----- src/slicing.rs | 14 +++++++------- 2 files changed, 31 insertions(+), 12 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 8d084706..5e6044a6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -159,6 +159,15 @@ pub mod form_urlencoded; #[derive(Clone)] #[cfg_attr(feature="heap_size", derive(HeapSizeOf))] pub struct Url { + /// Syntax in pseudo-BNF: + /// + /// url = scheme ":" [ hierarchical | non-hierarchical ] [ "?" query ]? [ "#" fragment ]? + /// non-hierarchical = non-hierarchical-path + /// non-hierarchical-path = /* Does not start with "/" */ + /// hierarchical = authority? hierarchical-path + /// authority = "//" userinfo? host [ ":" port ]? + /// userinfo = username [ ":" password ]? "@" + /// hierarchical-path = [ "/" path-segment ]+ serialization: String, // Components @@ -219,6 +228,13 @@ impl Url { self.slice(..self.scheme_end) } + /// Return whether the URL has a host. + #[inline] + pub fn has_host(&self) -> bool { + debug_assert!(self.byte_at(self.scheme_end) == b':'); + self.slice(self.scheme_end + 1 ..).starts_with("//") + } + /// Return whether this URL is non-relative (typical of e.g. `data:` and `mailto:` URLs.) #[inline] pub fn non_relative(&self) -> bool { @@ -228,7 +244,7 @@ impl Url { /// Return the username for this URL (typically the empty string) /// as a percent-encoded ASCII string. pub fn username(&self) -> &str { - if self.slice(self.scheme_end..).starts_with("://") { + if self.has_host() { self.slice(self.scheme_end + 3..self.username_end) } else { "" @@ -237,8 +253,11 @@ impl Url { /// Return the password for this URL, if any, as a percent-encoded ASCII string. 
pub fn password(&self) -> Option<&str> { + // This ':' is not the one marking a port number since a host can not be empty. + // (Except for file: URLs, which do not have port numbers.) if self.byte_at(self.username_end) == b':' { - debug_assert!(self.host().is_some()); + debug_assert!(self.has_host()); + debug_assert!(self.host_start < self.host_end); debug_assert!(self.byte_at(self.host_start - 1) == b'@'); Some(self.slice(self.username_end + 1..self.host_start - 1)) } else { @@ -256,10 +275,10 @@ impl Url { /// /// See also the `host` method. pub fn host_str(&self) -> Option<&str> { - if matches!(self.host, HostInternal::None) { - None - } else { + if self.has_host() { Some(self.slice(self.host_start..self.host_end)) + } else { + None } } diff --git a/src/slicing.rs b/src/slicing.rs index 502d11ff..94d6c38d 100644 --- a/src/slicing.rs +++ b/src/slicing.rs @@ -105,7 +105,7 @@ impl Url { Position::AfterScheme => self.scheme_end as usize, - Position::BeforeUsername => if self.slice(self.scheme_end..).starts_with("://") { + Position::BeforeUsername => if self.has_host() { self.scheme_end as usize + "://".len() } else { debug_assert!(self.byte_at(self.scheme_end) == b':'); @@ -115,18 +115,18 @@ impl Url { Position::AfterUsername => self.username_end as usize, - Position::BeforePassword => if self.port.is_some() { - debug_assert!(self.host().is_some()); - debug_assert!(self.byte_at(self.username_end) == b':'); + Position::BeforePassword => if self.byte_at(self.username_end) == b':' { + debug_assert!(self.has_host()); + debug_assert!(self.host_start < self.host_end); self.username_end as usize + ":".len() } else { debug_assert!(self.username_end == self.host_start); self.username_end as usize }, - Position::AfterPassword => if self.port.is_some() { - debug_assert!(self.host().is_some()); - debug_assert!(self.byte_at(self.username_end) == b':'); + Position::AfterPassword => if self.byte_at(self.username_end) == b':' { + debug_assert!(self.has_host()); + 
debug_assert!(self.host_start < self.host_end); debug_assert!(self.byte_at(self.host_start - "@".len() as u32) == b'@'); self.host_start as usize - "@".len() } else { From aaa1540a45b5e2d18c8303b1ab2ac6d666a14b7b Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 19 Feb 2016 21:34:14 +0100 Subject: [PATCH 33/89] Re-export OpaqueOrigin. It is exposed publicly through Origin::Opaque --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 5e6044a6..da1dae82 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -139,7 +139,7 @@ use std::path::{Path, PathBuf}; use std::str; pub use encoding::EncodingOverride; -pub use origin::Origin; +pub use origin::{Origin, OpaqueOrigin}; pub use host::{Host, HostAndPort, SocketAddrs}; pub use parser::{ParseError, to_u32}; pub use slicing::Position; From c9e687cf2f04e1d6769dbd4d90b38f9ad7d9ec68 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 19 Feb 2016 21:44:06 +0100 Subject: [PATCH 34/89] Add a scheme setter --- src/lib.rs | 45 ++++++++++++++++++++++++++++++++++++++------- src/parser.rs | 10 ++++++++++ src/webidl.rs | 6 +++--- 3 files changed, 51 insertions(+), 10 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index da1dae82..e36f729c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -420,13 +420,7 @@ impl Url { } fn mutate R, R>(&mut self, f: F) -> R { - let mut parser = Parser { - serialization: mem::replace(&mut self.serialization, String::new()), - base_url: None, - query_encoding_override: EncodingOverride::utf8(), - log_syntax_violation: None, - context: Context::Setter, - }; + let mut parser = Parser::for_setter(mem::replace(&mut self.serialization, String::new())); let result = f(&mut parser); self.serialization = parser.serialization; result @@ -477,6 +471,43 @@ impl Url { } } + /// Change this URL’s scheme. 
+ /// + /// Do nothing and return `Err` if: + /// * The new scheme is not in `[a-zA-Z][a-zA-Z0-9+.-]+` + /// * This URL is non-relative and the new scheme is one of + /// `http`, `https`, `ws`, `wss`, `ftp`, or `gopher` + pub fn set_scheme(&mut self, scheme: &str) -> Result<(), ()> { + self.set_scheme_internal(scheme, false) + } + + fn set_scheme_internal(&mut self, scheme: &str, allow_extra_input_after_colon: bool) + -> Result<(), ()> { + let mut parser = Parser::for_setter(String::new()); + let remaining = try!(parser.parse_scheme(scheme)); + if !(remaining.is_empty() || allow_extra_input_after_colon) { + return Err(()) + } + let old_scheme_end = self.scheme_end; + let new_scheme_end = to_u32(parser.serialization.len()).unwrap(); + let adjust = |index: &mut u32| { + *index -= old_scheme_end; + *index += new_scheme_end; + }; + + self.scheme_end = new_scheme_end; + adjust(&mut self.username_end); + adjust(&mut self.host_start); + adjust(&mut self.host_end); + adjust(&mut self.path_start); + if let Some(ref mut index) = self.query_start { adjust(index) } + if let Some(ref mut index) = self.fragment_start { adjust(index) } + + parser.serialization.push_str(self.slice(old_scheme_end..)); + self.serialization = parser.serialization; + Ok(()) + } + /// Convert a file name as `std::path::Path` into an URL in the `file` scheme. 
/// /// This returns `Err` if the given path is not absolute or, diff --git a/src/parser.rs b/src/parser.rs index b2be2280..6a04fc75 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -113,6 +113,16 @@ pub enum Context { } impl<'a> Parser<'a> { + pub fn for_setter(serialization: String) -> Parser<'a> { + Parser { + serialization: serialization, + base_url: None, + query_encoding_override: EncodingOverride::utf8(), + log_syntax_violation: None, + context: Context::Setter, + } + } + fn syntax_violation(&self, reason: &'static str) { if let Some(log) = self.log_syntax_violation { log(reason) diff --git a/src/webidl.rs b/src/webidl.rs index 2027dd60..3f86b64e 100644 --- a/src/webidl.rs +++ b/src/webidl.rs @@ -43,9 +43,9 @@ impl WebIdl { url.slice(..url.scheme_end + 1) } - /// **Not implemented yet** Setter for https://url.spec.whatwg.org/#dom-url-protocol - pub fn set_protocol(_url: &mut Url, _new_protocol: &str) { - unimplemented!() // FIXME + /// Setter for https://url.spec.whatwg.org/#dom-url-protocol + pub fn set_protocol(url: &mut Url, new_protocol: &str) { + let _ = url.set_scheme_internal(new_protocol, true); } /// Getter for https://url.spec.whatwg.org/#dom-url-username From 0c30434e7e288263a4e66e7d4aa971a343bac040 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 19 Feb 2016 22:40:38 +0100 Subject: [PATCH 35/89] Add host setters. 
--- src/lib.rs | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 96 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index e36f729c..f73bf16a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -129,11 +129,11 @@ use host::HostInternal; use parser::{Parser, Context}; use percent_encoding::{PATH_SEGMENT_ENCODE_SET, percent_encode, percent_decode}; use std::cmp; -use std::fmt; +use std::fmt::{self, Write}; use std::hash; use std::io; use std::mem; -use std::net::ToSocketAddrs; +use std::net::{ToSocketAddrs, Ipv4Addr, Ipv6Addr}; use std::ops::{Range, RangeFrom, RangeTo}; use std::path::{Path, PathBuf}; use std::str; @@ -471,6 +471,100 @@ impl Url { } } + /// Change this URL’s host. + /// + /// If this URL is non-relative, do nothing and return `Err`. + /// + /// Removing the host (calling this with `None`) + /// will also remove any username, password, and port number. + pub fn set_host(&mut self, host: Option<&str>) -> Result<(), ()> { + if self.non_relative() { + return Err(()) + } + + if let Some(host) = host { + self.set_host_internal(try!(Host::parse(host).map_err(|_| ()))) + } else if self.has_host() { + // Not debug_assert! 
since this proves that `unsafe` below is OK: + assert!(self.byte_at(self.scheme_end) == b':'); + assert!(self.byte_at(self.path_start) == b'/'); + let new_path_start = self.scheme_end + 1; + unsafe { + self.serialization.as_mut_vec() + .drain(self.path_start as usize..new_path_start as usize); + } + let offset = self.path_start - new_path_start; + self.path_start = new_path_start; + self.username_end = new_path_start; + self.host_start = new_path_start; + self.host_end = new_path_start; + self.port = None; + if let Some(ref mut index) = self.query_start { *index -= offset } + if let Some(ref mut index) = self.fragment_start { *index -= offset } + } + Ok(()) + } + + fn set_host_internal(&mut self, host: Host) { + let after_host = self.slice(self.host_end..).to_owned(); + self.serialization.truncate(self.host_start as usize); + if !self.has_host() { + debug_assert!(self.slice(self.scheme_end..self.host_start) == ":"); + debug_assert!(self.username_end == self.host_start); + self.serialization.push('/'); + self.serialization.push('/'); + self.username_end += 2; + self.host_start += 2; + } + let old_host_end = self.host_end; + write!(&mut self.serialization, "{}", host).unwrap(); + let new_host_end = to_u32(self.serialization.len()).unwrap(); + self.serialization.push_str(&after_host); + + self.host = match host { + Host::Domain(_) => HostInternal::Domain, + Host::Ipv4(address) => HostInternal::Ipv4(address), + Host::Ipv6(address) => HostInternal::Ipv6(address), + }; + self.host_end = new_host_end; + let adjust = |index: &mut u32| { + *index -= old_host_end; + *index += new_host_end; + }; + adjust(&mut self.host_end); + adjust(&mut self.path_start); + if let Some(ref mut index) = self.query_start { adjust(index) } + if let Some(ref mut index) = self.fragment_start { adjust(index) } + } + + /// Change this URL’s host to the given IPv4 address. + /// + /// If this URL is non-relative, do nothing and return `Err`. 
+ /// + /// Compared to `Url::set_host`, this skips the host parser. + pub fn set_ipv4_host(&mut self, address: Ipv4Addr) -> Result<(), ()> { + if self.non_relative() { + return Err(()) + } + + self.set_host_internal(Host::Ipv4(address)); + Ok(()) + } + + /// Change this URL’s host to the given IPv6 address. + /// + /// If this URL is non-relative, do nothing and return `Err`. + /// + /// Compared to `Url::set_host`, this skips the host parser. + pub fn set_ipv6_host(&mut self, address: Ipv6Addr) -> Result<(), ()> { + if self.non_relative() { + return Err(()) + } + + self.set_host_internal(Host::Ipv6(address)); + Ok(()) + } + /// Change this URL’s scheme. /// /// Do nothing and return `Err` if: From f8f9176bd76255b8e0df4b45a421179aa9edf7d4 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 23 Feb 2016 14:57:29 +0100 Subject: [PATCH 36/89] More setters Fix #154 (by adding `Url::push_path_segment`) --- src/lib.rs | 103 ++++++++++++++++++++++++++++++++++++++++ src/parser.rs | 32 ++++++++----- src/percent_encoding.rs | 4 +- 3 files changed, 124 insertions(+), 15 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index f73bf16a..a42b2da4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -471,6 +471,109 @@ impl Url { } } + /// Remove the last segment of this URL’s path. + /// + /// If this URL is non-relative, do nothing and return `Err`. 
+ pub fn pop_path_segment(&mut self) -> Result<(), ()> { + if self.non_relative() { + return Err(()) + } + let last_slash; + let path_len; + { + let path = self.path(); + last_slash = path.rfind('/').unwrap(); + path_len = path.len(); + }; + if last_slash > 0 { + // Found a slash other than the initial one + let last_slash = last_slash + self.path_start as usize; + let path_end = path_len + self.path_start as usize; + unsafe { + self.serialization.as_mut_vec().drain(last_slash..path_end); + } + let offset = (path_end - last_slash) as u32; + if let Some(ref mut index) = self.query_start { *index -= offset } + if let Some(ref mut index) = self.fragment_start { *index -= offset } + } + Ok(()) + } + + /// Add a segment at the end of this URL’s path. + /// + /// If this URL is non-relative, do nothing and return `Err`. + pub fn push_path_segment(&mut self, segment: &str) -> Result<(), ()> { + if self.non_relative() { + return Err(()) + } + let after_path = match (self.query_start, self.fragment_start) { + (Some(i), _) | (None, Some(i)) => { + let s = self.slice(i..).to_owned(); + self.serialization.truncate(i as usize); + Some(s) + }, + (None, None) => None + }; + let scheme_type = parser::SchemeType::from(self.scheme()); + let path_start = self.path_start as usize; + self.serialization.push('/'); + self.mutate(|parser| { + parser.context = parser::Context::PathSegmentSetter; + let mut has_host = true; // FIXME account for this? + parser.parse_path(scheme_type, &mut has_host, path_start, segment) + }); + let offset = to_u32(self.serialization.len()).unwrap() - self.path_start; + if let Some(ref mut index) = self.query_start { *index += offset } + if let Some(ref mut index) = self.fragment_start { *index += offset } + if let Some(ref after_path) = after_path { + self.serialization.push_str(after_path) + } + Ok(()) + } + + /// Change this URL’s port number. 
+ /// + /// If this URL is non-relative, does not have a host, or has the `file` scheme; + /// do nothing and return `Err`. + pub fn set_port(&mut self, mut port: Option) -> Result<(), ()> { + if self.non_relative() { + return Err(()) + } + if port.is_some() && port == parser::default_port(self.scheme()) { + port = None + } + match (self.port, port) { + (None, None) => {} + (Some(_), None) => { + unsafe { + self.serialization.as_mut_vec().drain( + self.host_end as usize .. self.path_start as usize); + } + let offset = self.path_start - self.host_end; + self.path_start = self.host_end; + if let Some(ref mut index) = self.query_start { *index -= offset } + if let Some(ref mut index) = self.fragment_start { *index -= offset } + } + (Some(old), Some(new)) if old == new => {} + (_, Some(new)) => { + let path_and_after = self.slice(self.path_start..).to_owned(); + self.serialization.truncate(self.host_end as usize); + write!(&mut self.serialization, ":{}", new).unwrap(); + let old_path_start = self.path_start; + let new_path_start = to_u32(self.serialization.len()).unwrap(); + self.path_start = new_path_start; + let adjust = |index: &mut u32| { + *index -= old_path_start; + *index += new_path_start; + }; + if let Some(ref mut index) = self.query_start { adjust(index) } + if let Some(ref mut index) = self.fragment_start { adjust(index) } + self.serialization.push_str(&path_and_after); + } + } + Ok(()) + } + /// Change this URL’s host. /// /// If this URL is non-relative, do nothing and return `Err`. 
diff --git a/src/parser.rs b/src/parser.rs index 6a04fc75..0627ad95 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -14,7 +14,8 @@ use super::{Url, EncodingOverride}; use host::{self, HostInternal}; use percent_encoding::{ utf8_percent_encode, percent_encode, - SIMPLE_ENCODE_SET, DEFAULT_ENCODE_SET, USERINFO_ENCODE_SET, QUERY_ENCODE_SET + SIMPLE_ENCODE_SET, DEFAULT_ENCODE_SET, USERINFO_ENCODE_SET, QUERY_ENCODE_SET, + PATH_SEGMENT_ENCODE_SET }; pub type ParseResult = Result; @@ -79,7 +80,7 @@ impl SchemeType { matches!(*self, SchemeType::File) } - fn from(s: &str) -> Self { + pub fn from(s: &str) -> Self { match s { "http" | "https" | "ws" | "wss" | "ftp" | "gopher" => SchemeType::SpecialNotFile, "file" => SchemeType::File, @@ -110,6 +111,7 @@ pub struct Parser<'a> { pub enum Context { UrlParser, Setter, + PathSegmentSetter, } impl<'a> Parser<'a> { @@ -185,12 +187,11 @@ impl<'a> Parser<'a> { } } // EOF before ':' - match self.context { - Context::Setter => Ok(""), - Context::UrlParser => { - self.serialization.clear(); - Err(()) - } + if self.context == Context::Setter { + Ok("") + } else { + self.serialization.clear(); + Err(()) } } @@ -746,9 +747,9 @@ impl<'a> Parser<'a> { self.parse_path(scheme_type, has_host, path_start, input) } - fn parse_path<'i>(&mut self, scheme_type: SchemeType, has_host: &mut bool, - path_start: usize, input: &'i str) - -> &'i str { + pub fn parse_path<'i>(&mut self, scheme_type: SchemeType, has_host: &mut bool, + path_start: usize, input: &'i str) + -> &'i str { // Relative path state debug_assert!(self.serialization.ends_with("/")); let mut iter = input.char_ranges(); @@ -786,8 +787,13 @@ impl<'a> Parser<'a> { } iter = after_percent_sign } - self.serialization.extend(utf8_percent_encode( - &input[i..next_i], DEFAULT_ENCODE_SET)); + if self.context == Context::PathSegmentSetter { + self.serialization.extend(utf8_percent_encode( + &input[i..next_i], PATH_SEGMENT_ENCODE_SET)); + } else { + self.serialization.extend(utf8_percent_encode( + 
&input[i..next_i], DEFAULT_ENCODE_SET)); + } } } } diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs index c3ebc34a..d9a26563 100644 --- a/src/percent_encoding.rs +++ b/src/percent_encoding.rs @@ -97,8 +97,8 @@ define_encode_set! { } define_encode_set! { - /// This encode set is used for username and password. - pub PATH_SEGMENT_ENCODE_SET = [DEFAULT_ENCODE_SET] | {'%'} + /// This encode set is used for on '/'-separated path segment + pub PATH_SEGMENT_ENCODE_SET = [DEFAULT_ENCODE_SET] | {'%', '/'} } define_encode_set! { From 5425385451eebe803c57026abcc66af10205806c Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 26 Feb 2016 16:11:29 +0100 Subject: [PATCH 37/89] Add a path setter --- src/lib.rs | 40 +++++++++++++++++++++++++++++++++++----- src/parser.rs | 11 ++++++----- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index a42b2da4..2093641d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -471,6 +471,38 @@ impl Url { } } + /// Change this URL’s path. 
+ pub fn set_path(&mut self, path: &str) { + let (old_after_path_pos, after_path) = match (self.query_start, self.fragment_start) { + (Some(i), _) | (None, Some(i)) => (i, self.slice(i..).to_owned()), + (None, None) => (to_u32(self.serialization.len()).unwrap(), String::new()) + }; + let non_relative = self.non_relative(); + let scheme_type = parser::SchemeType::from(self.scheme()); + self.serialization.truncate(self.path_start as usize); + self.mutate(|parser| { + if non_relative { + if path.starts_with('/') { + parser.serialization.push_str("%2F"); + parser.parse_non_relative_path(&path[1..]); + } else { + parser.parse_non_relative_path(path); + } + } else { + let mut has_host = true; // FIXME + parser.parse_path_start(scheme_type, &mut has_host, path); + } + }); + let new_after_path_pos = to_u32(self.serialization.len()).unwrap(); + let adjust = |index: &mut u32| { + *index -= old_after_path_pos; + *index += new_after_path_pos; + }; + if let Some(ref mut index) = self.query_start { adjust(index) } + if let Some(ref mut index) = self.fragment_start { adjust(index) } + self.serialization.push_str(&after_path) + } + /// Remove the last segment of this URL’s path. /// /// If this URL is non-relative, do nothing and return `Err`. 
@@ -510,9 +542,9 @@ impl Url { (Some(i), _) | (None, Some(i)) => { let s = self.slice(i..).to_owned(); self.serialization.truncate(i as usize); - Some(s) + s }, - (None, None) => None + (None, None) => String::new() }; let scheme_type = parser::SchemeType::from(self.scheme()); let path_start = self.path_start as usize; @@ -525,9 +557,7 @@ impl Url { let offset = to_u32(self.serialization.len()).unwrap() - self.path_start; if let Some(ref mut index) = self.query_start { *index += offset } if let Some(ref mut index) = self.fragment_start { *index += offset } - if let Some(ref after_path) = after_path { - self.serialization.push_str(after_path) - } + self.serialization.push_str(&after_path); Ok(()) } diff --git a/src/parser.rs b/src/parser.rs index 0627ad95..0fa15ac4 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -729,7 +729,7 @@ impl<'a> Parser<'a> { return Ok((opt_port, &input[end..])) } - fn parse_path_start<'i>(&mut self, scheme_type: SchemeType, has_host: &mut bool, + pub fn parse_path_start<'i>(&mut self, scheme_type: SchemeType, has_host: &mut bool, mut input: &'i str) -> &'i str { // Path start state @@ -760,12 +760,13 @@ impl<'a> Parser<'a> { end = input.len(); while let Some((i, c, next_i)) = iter.next() { match c { - '/' => { + '/' if self.context != Context::PathSegmentSetter => { ends_with_slash = true; end = i; break }, - '\\' if scheme_type.is_special() => { + '\\' if self.context != Context::PathSegmentSetter && + scheme_type.is_special() => { self.syntax_violation("backslash"); ends_with_slash = true; end = i; @@ -851,10 +852,10 @@ impl<'a> Parser<'a> { } - fn parse_non_relative_path<'i>(&mut self, input: &'i str) -> &'i str { + pub fn parse_non_relative_path<'i>(&mut self, input: &'i str) -> &'i str { for (i, c, next_i) in input.char_ranges() { match c { - '?' | '#' => return &input[i..], + '?' 
| '#' if self.context == Context::UrlParser => return &input[i..], '\t' | '\n' | '\r' => self.syntax_violation("invalid character"), _ => { self.check_url_code_point(input, i, c); From fc9a1db6378b08ccf07c11c2d4a5c79708839220 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 26 Feb 2016 19:56:06 +0100 Subject: [PATCH 38/89] Username and passowrd setters --- src/lib.rs | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 88 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 2093641d..3317066a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -127,7 +127,8 @@ extern crate idna; use host::HostInternal; use parser::{Parser, Context}; -use percent_encoding::{PATH_SEGMENT_ENCODE_SET, percent_encode, percent_decode}; +use percent_encoding::{PATH_SEGMENT_ENCODE_SET, USERINFO_ENCODE_SET, + percent_encode, percent_decode, utf8_percent_encode}; use std::cmp; use std::fmt::{self, Write}; use std::hash; @@ -664,7 +665,6 @@ impl Url { *index -= old_host_end; *index += new_host_end; }; - adjust(&mut self.host_end); adjust(&mut self.path_start); if let Some(ref mut index) = self.query_start { adjust(index) } if let Some(ref mut index) = self.fragment_start { adjust(index) } @@ -698,6 +698,92 @@ impl Url { Ok(()) } + /// Change this URL’s password. + /// + /// If this URL is non-relative or does not have a host, do nothing and return `Err`. 
+ pub fn set_password(&mut self, password: Option<&str>) -> Result<(), ()> { + if !self.has_host() { + return Err(()) + } + if let Some(password) = password { + let host_and_after = self.slice(self.host_start..).to_owned(); + self.serialization.truncate(self.username_end as usize); + self.serialization.push(':'); + self.serialization.extend(utf8_percent_encode(password, USERINFO_ENCODE_SET)); + self.serialization.push('@'); + + let old_host_start = self.host_start; + let new_host_start = to_u32(self.serialization.len()).unwrap(); + let adjust = |index: &mut u32| { + *index -= old_host_start; + *index += new_host_start; + }; + self.host_start = new_host_start; + adjust(&mut self.host_end); + adjust(&mut self.path_start); + if let Some(ref mut index) = self.query_start { adjust(index) } + if let Some(ref mut index) = self.fragment_start { adjust(index) } + + self.serialization.push_str(&host_and_after); + } else if self.byte_at(self.username_end) == b':' { // If there is a password to remove + let has_username_or_password = self.byte_at(self.host_start - 1) == b'@'; + debug_assert!(has_username_or_password); + let username_start = self.scheme_end + 3; + let empty_username = username_start == self.username_end; + let start = self.username_end; // Remove the ':' + let end = if empty_username { + self.host_start // Remove the '@' as well + } else { + self.host_start - 1 // Keep the '@' to separate the username from the host + }; + unsafe { + self.serialization.as_mut_vec().drain(start as usize .. end as usize); + } + let offset = end - start; + self.host_start -= offset; + self.host_end -= offset; + if let Some(ref mut index) = self.query_start { *index -= offset } + if let Some(ref mut index) = self.fragment_start { *index -= offset } + } + Ok(()) + } + + /// Change this URL’s username. + /// + /// If this URL is non-relative or does not have a host, do nothing and return `Err`. 
+ pub fn set_username(&mut self, username: &str) -> Result<(), ()> { + if !self.has_host() { + return Err(()) + } + let username_start = self.scheme_end + 3; + if self.slice(username_start..self.username_end) == username { + return Ok(()) + } + let after_username = self.slice(self.username_end..).to_owned(); + self.serialization.truncate(username_start as usize); + self.serialization.extend(utf8_percent_encode(username, USERINFO_ENCODE_SET)); + + let old_username_end = self.username_end; + let new_username_end = to_u32(self.serialization.len()).unwrap(); + let adjust = |index: &mut u32| { + *index -= old_username_end; + *index += new_username_end; + }; + + self.username_end = new_username_end; + adjust(&mut self.host_start); + adjust(&mut self.host_end); + adjust(&mut self.path_start); + if let Some(ref mut index) = self.query_start { adjust(index) } + if let Some(ref mut index) = self.fragment_start { adjust(index) } + + if !after_username.starts_with(|c| matches!(c, '@' | ':')) { + self.serialization.push('@'); + } + self.serialization.push_str(&after_username); + Ok(()) + } + /// Change this URL’s scheme. /// /// Do nothing and return `Err` if: From 8306485a7c2ba6dacffcaece567f2bae68b4a468 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 26 Feb 2016 20:00:33 +0100 Subject: [PATCH 39/89] More WebIDL implementations. --- src/webidl.rs | 49 +++++++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/src/webidl.rs b/src/webidl.rs index 3f86b64e..fa60b8cd 100644 --- a/src/webidl.rs +++ b/src/webidl.rs @@ -7,19 +7,30 @@ // except according to those terms. 
use {Url, ParseError}; +use host::Host; +use idna::domain_to_unicode; /// https://url.spec.whatwg.org/#api pub struct WebIdl; impl WebIdl { - /// **Not implemented yet** https://url.spec.whatwg.org/#dom-url-domaintoascii - pub fn domain_to_ascii(_domain: &str) -> String { - unimplemented!() // FIXME + /// https://url.spec.whatwg.org/#dom-url-domaintoascii + pub fn domain_to_ascii(domain: &str) -> String { + match Host::parse(domain) { + Ok(Host::Domain(domain)) => domain, + _ => String::new(), + } } - /// **Not implemented yet** https://url.spec.whatwg.org/#dom-url-domaintounicode - pub fn domain_to_unicode(_domain: &str) -> String { - unimplemented!() // FIXME + /// https://url.spec.whatwg.org/#dom-url-domaintounicode + pub fn domain_to_unicode(domain: &str) -> String { + match Host::parse(domain) { + Ok(Host::Domain(ref domain)) => { + let (unicode, _errors) = domain_to_unicode(domain); + unicode + } + _ => String::new(), + } } pub fn href(url: &Url) -> &str { @@ -54,9 +65,9 @@ impl WebIdl { url.username() } - /// **Not implemented yet** Setter for https://url.spec.whatwg.org/#dom-url-username - pub fn set_username(_url: &mut Url, _new_username: &str) { - unimplemented!() // FIXME + /// Setter for https://url.spec.whatwg.org/#dom-url-username + pub fn set_username(url: &mut Url, new_username: &str) { + let _ = url.set_username(new_username); } /// Getter for https://url.spec.whatwg.org/#dom-url-password @@ -65,9 +76,9 @@ impl WebIdl { url.password().unwrap_or("") } - /// **Not implemented yet** Setter for https://url.spec.whatwg.org/#dom-url-password - pub fn set_password(_url: &mut Url, _new_password: &str) { - unimplemented!() // FIXME + /// Setter for https://url.spec.whatwg.org/#dom-url-password + pub fn set_password(url: &mut Url, new_password: &str) { + let _ = url.set_password(if new_password.is_empty() { None } else { Some(new_password) }); } /// Getter for https://url.spec.whatwg.org/#dom-url-host @@ -115,9 +126,11 @@ impl WebIdl { url.path() } - /// 
**Not implemented yet** Setter for https://url.spec.whatwg.org/#dom-url-pathname - pub fn set_pathname(_url: &mut Url, _new_pathname: &str) { - unimplemented!() // FIXME + /// Setter for https://url.spec.whatwg.org/#dom-url-pathname + pub fn set_pathname(url: &mut Url, new_pathname: &str) { + if !url.non_relative() { + url.set_path(new_pathname) + } } /// Getter for https://url.spec.whatwg.org/#dom-url-search @@ -148,9 +161,9 @@ impl WebIdl { }) } - /// **Not implemented yet** Getter for https://url.spec.whatwg.org/#dom-url-searchparams - pub fn search_params(_url: &Url) -> Vec<(String, String)> { - unimplemented!(); // FIXME + /// Getter for https://url.spec.whatwg.org/#dom-url-searchparams + pub fn search_params(url: &Url) -> Vec<(String, String)> { + url.query_pairs().unwrap_or_else(Vec::new) } /// Getter for https://url.spec.whatwg.org/#dom-url-hash From 06f2fafa3f1c5fef9d2c7cb499e533927379058c Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 1 Mar 2016 16:47:23 +0100 Subject: [PATCH 40/89] Port setters --- src/lib.rs | 8 ++++++-- src/parser.rs | 19 +++++++++++-------- src/webidl.rs | 18 +++++++++++++++--- 3 files changed, 32 insertions(+), 13 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 3317066a..c0f081ac 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -567,12 +567,17 @@ impl Url { /// If this URL is non-relative, does not have a host, or has the `file` scheme; /// do nothing and return `Err`. pub fn set_port(&mut self, mut port: Option) -> Result<(), ()> { - if self.non_relative() { + if !self.has_host() || self.scheme() == "file" { return Err(()) } if port.is_some() && port == parser::default_port(self.scheme()) { port = None } + self.set_port_inner(port); + Ok(()) + } + + fn set_port_inner(&mut self, port: Option) { match (self.port, port) { (None, None) => {} (Some(_), None) => { @@ -602,7 +607,6 @@ impl Url { self.serialization.push_str(&path_and_after); } } - Ok(()) } /// Change this URL’s host. 
diff --git a/src/parser.rs b/src/parser.rs index 0fa15ac4..ba177d63 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -606,10 +606,15 @@ impl<'a> Parser<'a> { let (host, remaining) = try!(self.parse_host(input, scheme_type)); let host_end = try!(to_u32(self.serialization.len())); let (port, remaining) = if remaining.starts_with(":") { - try!(self.parse_port(&remaining[1..], scheme_end)) + let syntax_violation = |message| self.syntax_violation(message); + let scheme = || default_port(&self.serialization[..scheme_end as usize]); + try!(Parser::parse_port(&remaining[1..], syntax_violation, scheme)) } else { (None, remaining) }; + if let Some(port) = port { + write!(&mut self.serialization, ":{}", port).unwrap() + } Ok((host_end, host, port, remaining)) } @@ -696,8 +701,9 @@ impl<'a> Parser<'a> { Ok((true, host, &input[end..])) } - pub fn parse_port<'i>(&mut self, input: &'i str, scheme_end: u32) - -> ParseResult<(Option, &'i str)> { + pub fn parse_port<'i, V, P>(input: &'i str, syntax_violation: V, default_port: P) + -> ParseResult<(Option, &'i str)> + where V: Fn(&'static str), P: Fn() -> Option { let mut port = 0; let mut has_any_digit = false; let mut end = input.len(); @@ -714,17 +720,14 @@ impl<'a> Parser<'a> { end = i; break }, - '\t' | '\n' | '\r' => self.syntax_violation("invalid character"), + '\t' | '\n' | '\r' => syntax_violation("invalid character"), _ => return Err(ParseError::InvalidPort) } } } let mut opt_port = Some(port as u16); - if !has_any_digit || opt_port == default_port(&self.serialization[..scheme_end as usize]) { + if !has_any_digit || opt_port == default_port() { opt_port = None; - } else { - self.serialization.push(':'); - write!(&mut self.serialization, "{}", port).unwrap(); } return Ok((opt_port, &input[end..])) } diff --git a/src/webidl.rs b/src/webidl.rs index fa60b8cd..2ccb6766 100644 --- a/src/webidl.rs +++ b/src/webidl.rs @@ -9,6 +9,7 @@ use {Url, ParseError}; use host::Host; use idna::domain_to_unicode; +use parser::{Parser, 
default_port}; /// https://url.spec.whatwg.org/#api pub struct WebIdl; @@ -115,9 +116,20 @@ impl WebIdl { } } - /// **Not implemented yet** Setter for https://url.spec.whatwg.org/#dom-url-port - pub fn set_port(_url: &mut Url, _new_port: &str) { - unimplemented!() // FIXME + /// Setter for https://url.spec.whatwg.org/#dom-url-port + pub fn set_port(url: &mut Url, new_port: &str) { + let result; + { + // has_host implies !non_relative + let scheme = url.scheme(); + if !url.has_host() || scheme == "file" { + return + } + result = Parser::parse_port(new_port, |_| (), || default_port(scheme)) + } + if let Ok((new_port, _remaining)) = result { + url.set_port_inner(new_port) + } } /// Getter for https://url.spec.whatwg.org/#dom-url-pathname From a1f389fc838d238cb85f84615a149d655b3a9520 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 1 Mar 2016 18:25:40 +0100 Subject: [PATCH 41/89] All setters. --- src/host.rs | 23 ++++++++++------------- src/lib.rs | 43 ++++++++++++++++++++++++------------------- src/parser.rs | 35 +++++++++++++++++++---------------- src/webidl.rs | 43 +++++++++++++++++++++++++++++++++++-------- 4 files changed, 88 insertions(+), 56 deletions(-) diff --git a/src/host.rs b/src/host.rs index 5eb01a63..cb960a62 100644 --- a/src/host.rs +++ b/src/host.rs @@ -24,6 +24,16 @@ pub enum HostInternal { Ipv6(Ipv6Addr), } +impl From> for HostInternal { + fn from(host: Host) -> HostInternal { + match host { + Host::Domain(_) => HostInternal::Domain, + Host::Ipv4(address) => HostInternal::Ipv4(address), + Host::Ipv6(address) => HostInternal::Ipv6(address), + } + } +} + /// The host name of an URL. #[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] #[cfg_attr(feature="heap_size", derive(HeapSizeOf))] @@ -157,19 +167,6 @@ impl Iterator for SocketAddrs { } } -/// Parse `input` as a host. -/// If successful, write its serialization to `serialization` -/// and return the internal representation for `Url`. 
-pub fn parse(input: &str, serialization: &mut String) -> ParseResult { - let host = try!(Host::parse(input)); - write!(serialization, "{}", host).unwrap(); - match host { - Host::Domain(_) => Ok(HostInternal::Domain), - Host::Ipv4(address) => Ok(HostInternal::Ipv4(address)), - Host::Ipv6(address) => Ok(HostInternal::Ipv6(address)), - } -} - fn write_ipv6(addr: &Ipv6Addr, f: &mut Formatter) -> fmt::Result { let segments = addr.segments(); let (compress_start, compress_end) = longest_zero_sequence(&segments); diff --git a/src/lib.rs b/src/lib.rs index c0f081ac..1e2f3778 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -573,11 +573,11 @@ impl Url { if port.is_some() && port == parser::default_port(self.scheme()) { port = None } - self.set_port_inner(port); + self.set_port_internal(port); Ok(()) } - fn set_port_inner(&mut self, port: Option) { + fn set_port_internal(&mut self, port: Option) { match (self.port, port) { (None, None) => {} (Some(_), None) => { @@ -611,7 +611,8 @@ impl Url { /// Change this URL’s host. /// - /// If this URL is non-relative, do nothing and return `Err`. + /// If this URL is non-relative or there is an error parsing the given `host`, + /// do nothing and return `Err`. /// /// Removing the host (calling this with `None`) /// will also remove any username, password, and port number. @@ -621,7 +622,7 @@ impl Url { } if let Some(host) = host { - self.set_host_internal(try!(Host::parse(host).map_err(|_| ()))) + self.set_host_internal(try!(Host::parse(host).map_err(|_| ())), None) } else if self.has_host() { // Not debug_assert! since this proves that `unsafe` below is OK: assert!(self.byte_at(self.scheme_end) == b':'); @@ -643,8 +644,10 @@ impl Url { Ok(()) } - fn set_host_internal(&mut self, host: Host) { - let after_host = self.slice(self.host_end..).to_owned(); + /// opt_new_port: None means leave unchanged, Some(None) means remove any port number. 
+ fn set_host_internal(&mut self, host: Host, opt_new_port: Option>) { + let old_suffix_pos = if opt_new_port.is_some() { self.path_start } else { self.host_end }; + let suffix = self.slice(old_suffix_pos..).to_owned(); self.serialization.truncate(self.host_start as usize); if !self.has_host() { debug_assert!(self.slice(self.scheme_end..self.host_start) == ":"); @@ -654,20 +657,22 @@ impl Url { self.username_end += 2; self.host_start += 2; } - let old_host_end = self.host_end; write!(&mut self.serialization, "{}", host).unwrap(); - let new_host_end = to_u32(self.serialization.len()).unwrap(); - self.serialization.push_str(&after_host); + self.host_end = to_u32(self.serialization.len()).unwrap(); + self.host = host.into(); + + if let Some(new_port) = opt_new_port { + self.port = new_port; + if let Some(port) = new_port { + write!(&mut self.serialization, ":{}", port).unwrap(); + } + } + let new_suffix_pos = to_u32(self.serialization.len()).unwrap(); + self.serialization.push_str(&suffix); - self.host = match host { - Host::Domain(_) => HostInternal::Domain, - Host::Ipv4(address) => HostInternal::Ipv4(address), - Host::Ipv6(address) => HostInternal::Ipv6(address), - }; - self.host_end = new_host_end; let adjust = |index: &mut u32| { - *index -= old_host_end; - *index += new_host_end; + *index -= old_suffix_pos; + *index += new_suffix_pos; }; adjust(&mut self.path_start); if let Some(ref mut index) = self.query_start { adjust(index) } @@ -684,7 +689,7 @@ impl Url { return Err(()) } - self.set_host_internal(Host::Ipv4(address)); + self.set_host_internal(Host::Ipv4(address), None); Ok(()) } @@ -698,7 +703,7 @@ impl Url { return Err(()) } - self.set_host_internal(Host::Ipv6(address)); + self.set_host_internal(Host::Ipv6(address), None); Ok(()) } diff --git a/src/parser.rs b/src/parser.rs index ba177d63..85dc39af 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -11,7 +11,7 @@ use std::error::Error; use std::fmt::{self, Formatter, Write}; use super::{Url, 
EncodingOverride}; -use host::{self, HostInternal}; +use host::{Host, HostInternal}; use percent_encoding::{ utf8_percent_encode, percent_encode, SIMPLE_ENCODE_SET, DEFAULT_ENCODE_SET, USERINFO_ENCODE_SET, QUERY_ENCODE_SET, @@ -603,7 +603,9 @@ impl<'a> Parser<'a> { pub fn parse_host_and_port<'i>(&mut self, input: &'i str, scheme_end: u32, scheme_type: SchemeType) -> ParseResult<(u32, HostInternal, Option, &'i str)> { - let (host, remaining) = try!(self.parse_host(input, scheme_type)); + let (host, remaining) = try!( + Parser::parse_host(input, scheme_type, |m| self.syntax_violation(m))); + write!(&mut self.serialization, "{}", host).unwrap(); let host_end = try!(to_u32(self.serialization.len())); let (port, remaining) = if remaining.starts_with(":") { let syntax_violation = |message| self.syntax_violation(message); @@ -615,11 +617,12 @@ impl<'a> Parser<'a> { if let Some(port) = port { write!(&mut self.serialization, ":{}", port).unwrap() } - Ok((host_end, host, port, remaining)) + Ok((host_end, host.into(), port, remaining)) } - pub fn parse_host<'i>(&mut self, input: &'i str, scheme_type: SchemeType) - -> ParseResult<(HostInternal, &'i str)> { + pub fn parse_host<'i, S>(input: &'i str, scheme_type: SchemeType, syntax_violation: S) + -> ParseResult<(Host, &'i str)> + where S: Fn(&'static str) { let mut inside_square_brackets = false; let mut has_ignored_chars = false; let mut end = input.len(); @@ -638,7 +641,7 @@ impl<'a> Parser<'a> { break } b'\t' | b'\n' | b'\r' => { - self.syntax_violation("invalid character"); + syntax_violation("invalid character"); has_ignored_chars = true; } b'[' => inside_square_brackets = true, @@ -656,7 +659,7 @@ impl<'a> Parser<'a> { if scheme_type.is_special() && host_input.is_empty() { return Err(ParseError::EmptyHost) } - let host = try!(host::parse(&host_input, &mut self.serialization)); + let host = try!(Host::parse(&host_input)); Ok((host, &input[end..])) } @@ -687,17 +690,17 @@ impl<'a> Parser<'a> { if 
is_windows_drive_letter(host_input) { return Ok((false, HostInternal::None, input)) } - let mut host; - if host_input.is_empty() { - host = HostInternal::None; + let host = if host_input.is_empty() { + HostInternal::None } else { - let host_start = self.serialization.len(); - host = try!(host::parse(&host_input, &mut self.serialization)); - if &self.serialization[host_start..] == "localhost" { - host = HostInternal::None; - self.serialization.truncate(host_start); + match try!(Host::parse(&host_input)) { + Host::Domain(ref d) if d == "localhost" => HostInternal::None, + host => { + write!(&mut self.serialization, "{}", host).unwrap(); + host.into() + } } - } + }; Ok((true, host, &input[end..])) } diff --git a/src/webidl.rs b/src/webidl.rs index 2ccb6766..22c734a6 100644 --- a/src/webidl.rs +++ b/src/webidl.rs @@ -9,7 +9,7 @@ use {Url, ParseError}; use host::Host; use idna::domain_to_unicode; -use parser::{Parser, default_port}; +use parser::{Parser, SchemeType, default_port}; /// https://url.spec.whatwg.org/#api pub struct WebIdl; @@ -89,9 +89,30 @@ impl WebIdl { host } - /// **Not implemented yet** Setter for https://url.spec.whatwg.org/#dom-url-host - pub fn set_host(_url: &mut Url, _new_host: &str) { - unimplemented!() // FIXME + /// Setter for https://url.spec.whatwg.org/#dom-url-host + pub fn set_host(url: &mut Url, new_host: &str) { + if url.non_relative() { + return + } + let host; + let opt_port; + { + let scheme = url.scheme(); + let result = Parser::parse_host(new_host, SchemeType::from(scheme), |_| ()); + match result { + Ok((h, remaining)) => { + host = h; + opt_port = if remaining.starts_with(':') { + Parser::parse_port(remaining, |_| (), || default_port(scheme)) + .ok().map(|(port, _remaining)| port) + } else { + None + }; + } + Err(_) => return + } + } + url.set_host_internal(host, opt_port) } /// Getter for https://url.spec.whatwg.org/#dom-url-hostname @@ -100,9 +121,15 @@ impl WebIdl { url.host_str().unwrap_or("") } - /// **Not implemented yet** 
Setter for https://url.spec.whatwg.org/#dom-url-hostname - pub fn set_hostname(_url: &mut Url, _new_hostname: &str) { - unimplemented!() // FIXME + /// Setter for https://url.spec.whatwg.org/#dom-url-hostname + pub fn set_hostname(url: &mut Url, new_hostname: &str) { + if url.non_relative() { + return + } + let result = Parser::parse_host(new_hostname, SchemeType::from(url.scheme()), |_| ()); + if let Ok((host, _remaining)) = result { + url.set_host_internal(host, None) + } } /// Getter for https://url.spec.whatwg.org/#dom-url-port @@ -128,7 +155,7 @@ impl WebIdl { result = Parser::parse_port(new_port, |_| (), || default_port(scheme)) } if let Ok((new_port, _remaining)) = result { - url.set_port_inner(new_port) + url.set_port_internal(new_port) } } From cb61d4363515fc6a609bb52ff16dbc9a2b62dafd Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 23 Mar 2016 18:16:48 +0100 Subject: [PATCH 42/89] Replase set_ipv{4,6}_host with set_ip_host taking IpAddr. `std::net::IpAddr` is now `#[stable]` in the stable channel. (Rust 1.7+) --- src/lib.rs | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 1e2f3778..db26da98 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -134,7 +134,7 @@ use std::fmt::{self, Write}; use std::hash; use std::io; use std::mem; -use std::net::{ToSocketAddrs, Ipv4Addr, Ipv6Addr}; +use std::net::{ToSocketAddrs, IpAddr}; use std::ops::{Range, RangeFrom, RangeTo}; use std::path::{Path, PathBuf}; use std::str; @@ -679,31 +679,21 @@ impl Url { if let Some(ref mut index) = self.fragment_start { adjust(index) } } - /// Change this URL’s host to the given IPv4 address. + /// Change this URL’s host to the given IP address. /// /// If this URL is non-relative, do nothing and return `Err`. /// /// Compared to `Url::set_host`, this skips the host parser. 
- pub fn set_ipv4_host(&mut self, address: Ipv4Addr) -> Result<(), ()> { + pub fn set_ip_host(&mut self, address: IpAddr) -> Result<(), ()> { if self.non_relative() { return Err(()) } - self.set_host_internal(Host::Ipv4(address), None); - Ok(()) - } - - /// Change this URL’s host to the given IPv6 address. - /// - /// If this URL is non-relative, do nothing and return `Err`. - /// - /// Compared to `Url::set_host`, this skips the host parser. - pub fn set_ipv6_host(&mut self, address: Ipv6Addr) -> Result<(), ()> { - if self.non_relative() { - return Err(()) - } - - self.set_host_internal(Host::Ipv6(address), None); + let address = match address { + IpAddr::V4(address) => Host::Ipv4(address), + IpAddr::V6(address) => Host::Ipv6(address), + }; + self.set_host_internal(address, None); Ok(()) } From 450af0e482dfe6a1cc81f5cf032380e28c4e43f2 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 23 Mar 2016 18:29:45 +0100 Subject: [PATCH 43/89] Maintain the invariant that an URL can not be both non-relative and special. 
--- src/lib.rs | 9 +++++---- src/parser.rs | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index db26da98..ee656088 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -142,7 +142,7 @@ use std::str; pub use encoding::EncodingOverride; pub use origin::{Origin, OpaqueOrigin}; pub use host::{Host, HostAndPort, SocketAddrs}; -pub use parser::{ParseError, to_u32}; +pub use parser::{ParseError, SchemeType, to_u32}; pub use slicing::Position; pub use webidl::WebIdl; @@ -479,7 +479,7 @@ impl Url { (None, None) => (to_u32(self.serialization.len()).unwrap(), String::new()) }; let non_relative = self.non_relative(); - let scheme_type = parser::SchemeType::from(self.scheme()); + let scheme_type = SchemeType::from(self.scheme()); self.serialization.truncate(self.path_start as usize); self.mutate(|parser| { if non_relative { @@ -547,7 +547,7 @@ impl Url { }, (None, None) => String::new() }; - let scheme_type = parser::SchemeType::from(self.scheme()); + let scheme_type = SchemeType::from(self.scheme()); let path_start = self.path_start as usize; self.serialization.push('/'); self.mutate(|parser| { @@ -797,7 +797,8 @@ impl Url { -> Result<(), ()> { let mut parser = Parser::for_setter(String::new()); let remaining = try!(parser.parse_scheme(scheme)); - if !(remaining.is_empty() || allow_extra_input_after_colon) { + if (!remaining.is_empty() && !allow_extra_input_after_colon) || + (!self.has_host() && SchemeType::from(&parser.serialization).is_special()) { return Err(()) } let old_scheme_end = self.scheme_end; diff --git a/src/parser.rs b/src/parser.rs index 85dc39af..82bb00f7 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -72,11 +72,11 @@ pub enum SchemeType { } impl SchemeType { - fn is_special(&self) -> bool { + pub fn is_special(&self) -> bool { !matches!(*self, SchemeType::NotSpecial) } - fn is_file(&self) -> bool { + pub fn is_file(&self) -> bool { matches!(*self, SchemeType::File) } From 4ee76816b87632286676226820a5860f9e70d3f4 Mon 
Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 23 Mar 2016 19:17:21 +0100 Subject: [PATCH 44/89] Add Url::into_string --- src/lib.rs | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index ee656088..a89cab2f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -218,11 +218,22 @@ impl Url { }.parse_url(input) } + /// Return the serialization of this URL. + /// + /// This is fast since that serialization is already stored in the `Url` struct. #[inline] pub fn as_str(&self) -> &str { &self.serialization } + /// Return the serialization of this URL. + /// + /// This consumes the `Url` and takes ownership of the `String` stored in it. + #[inline] + pub fn into_string(self) -> String { + self.serialization + } + /// Return the scheme of this URL, lower-cased, as an ASCII string without the ':' delimiter. #[inline] pub fn scheme(&self) -> &str { From 9bf2c60699a337d007cb002700259aa0c8f190b9 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 23 Mar 2016 20:25:30 +0100 Subject: [PATCH 45/89] Rename non-relative to cannot-be-a-base, per upcoming spec change. 
See https://github.com/whatwg/url/issues/105 --- src/lib.rs | 54 ++++++++++++++++++++++------------------- src/parser.rs | 14 +++++------ src/percent_encoding.rs | 2 +- src/webidl.rs | 8 +++--- 4 files changed, 41 insertions(+), 37 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index a89cab2f..4c2504ba 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -68,10 +68,10 @@ assert!(issue_list_url.path_segments().map(|c| c.collect::>()) == Some(vec!["rust-lang", "rust", "issues"])); assert!(issue_list_url.query() == Some("labels=E-easy&state=open")); assert!(issue_list_url.fragment() == None); -assert!(!issue_list_url.non_relative()); +assert!(!issue_list_url.cannot_be_a_base()); ``` -Some URLs are said to be "non-relative": +Some URLs are said to be *cannot-be-a-base*: they don’t have a username, password, host, or port, and their "path" is an arbitrary string rather than slash-separated segments: @@ -80,7 +80,7 @@ use url::Url; let data_url = Url::parse("data:text/plain,Hello?World#").unwrap(); -assert!(data_url.non_relative()); +assert!(data_url.cannot_be_a_base()); assert!(data_url.scheme() == "data"); assert!(data_url.path() == "text/plain,Hello"); assert!(data_url.path_segments().is_none()); @@ -247,9 +247,13 @@ impl Url { self.slice(self.scheme_end + 1 ..).starts_with("//") } - /// Return whether this URL is non-relative (typical of e.g. `data:` and `mailto:` URLs.) + /// Return whether this URL is a cannot-be-a-base URL, + /// meaning that parsing a relative URL string with this URL as the base will return an error. + /// + /// This is the case if the scheme and `:` delimiter are not followed by a `/` slash, + /// as is typically the case of `data:` and `mailto:` URLs. #[inline] - pub fn non_relative(&self) -> bool { + pub fn cannot_be_a_base(&self) -> bool { self.byte_at(self.path_start) != b'/' } @@ -282,7 +286,7 @@ impl Url { /// Non-ASCII domains are punycode-encoded per IDNA. /// IPv6 addresses are given between `[` and `]` brackets. 
/// - /// Non-relative URLs (typical of `data:` and `mailto:`) and some `file:` URLs + /// Cannot-be-a-base URLs (typical of `data:` and `mailto:`) and some `file:` URLs /// don’t have a host. /// /// See also the `host` method. @@ -297,7 +301,7 @@ impl Url { /// Return the parsed representation of the host for this URL. /// Non-ASCII domain labels are punycode-encoded per IDNA. /// - /// Non-relative URLs (typical of `data:` and `mailto:`) and some `file:` URLs + /// Cannot-be-a-base URLs (typical of `data:` and `mailto:`) and some `file:` URLs /// don’t have a host. /// /// See also the `host_str` method. @@ -381,7 +385,7 @@ impl Url { /// Return the path for this URL, as a percent-encoded ASCII string. /// For relative URLs, this starts with a '/' slash /// and continues with slash-separated path segments. - /// For non-relative URLs, this is an arbitrary string that doesn’t start with '/'. + /// For cannot-be-a-base URLs, this is an arbitrary string that doesn’t start with '/'. pub fn path(&self) -> &str { match (self.query_start, self.fragment_start) { (None, None) => self.slice(self.path_start..), @@ -395,7 +399,7 @@ impl Url { /// If this URL is relative, return an iterator of '/' slash-separated path segments, /// each as a percent-encoded ASCII string. /// - /// Return `None` for non-relative URLs, or an iterator of at least one string. + /// Return `None` for cannot-be-a-base URLs, or an iterator of at least one string. 
pub fn path_segments(&self) -> Option> { let path = self.path(); if path.starts_with('/') { @@ -489,16 +493,16 @@ impl Url { (Some(i), _) | (None, Some(i)) => (i, self.slice(i..).to_owned()), (None, None) => (to_u32(self.serialization.len()).unwrap(), String::new()) }; - let non_relative = self.non_relative(); + let cannot_be_a_base = self.cannot_be_a_base(); let scheme_type = SchemeType::from(self.scheme()); self.serialization.truncate(self.path_start as usize); self.mutate(|parser| { - if non_relative { + if cannot_be_a_base { if path.starts_with('/') { parser.serialization.push_str("%2F"); - parser.parse_non_relative_path(&path[1..]); + parser.parse_cannot_be_a_base_path(&path[1..]); } else { - parser.parse_non_relative_path(path); + parser.parse_cannot_be_a_base_path(path); } } else { let mut has_host = true; // FIXME @@ -517,9 +521,9 @@ impl Url { /// Remove the last segment of this URL’s path. /// - /// If this URL is non-relative, do nothing and return `Err`. + /// If this URL is cannot-be-a-base, do nothing and return `Err`. pub fn pop_path_segment(&mut self) -> Result<(), ()> { - if self.non_relative() { + if self.cannot_be_a_base() { return Err(()) } let last_slash; @@ -545,9 +549,9 @@ impl Url { /// Add a segment at the end of this URL’s path. /// - /// If this URL is non-relative, do nothing and return `Err`. + /// If this URL is cannot-be-a-base, do nothing and return `Err`. pub fn push_path_segment(&mut self, segment: &str) -> Result<(), ()> { - if self.non_relative() { + if self.cannot_be_a_base() { return Err(()) } let after_path = match (self.query_start, self.fragment_start) { @@ -575,7 +579,7 @@ impl Url { /// Change this URL’s port number. /// - /// If this URL is non-relative, does not have a host, or has the `file` scheme; + /// If this URL is cannot-be-a-base, does not have a host, or has the `file` scheme; /// do nothing and return `Err`. 
pub fn set_port(&mut self, mut port: Option) -> Result<(), ()> { if !self.has_host() || self.scheme() == "file" { @@ -622,13 +626,13 @@ impl Url { /// Change this URL’s host. /// - /// If this URL is non-relative or there is an error parsing the given `host`, + /// If this URL is cannot-be-a-base or there is an error parsing the given `host`, /// do nothing and return `Err`. /// /// Removing the host (calling this with `None`) /// will also remove any username, password, and port number. pub fn set_host(&mut self, host: Option<&str>) -> Result<(), ()> { - if self.non_relative() { + if self.cannot_be_a_base() { return Err(()) } @@ -692,11 +696,11 @@ impl Url { /// Change this URL’s host to the given IP address. /// - /// If this URL is non-relative, do nothing and return `Err`. + /// If this URL is cannot-be-a-base, do nothing and return `Err`. /// /// Compared to `Url::set_host`, this skips the host parser. pub fn set_ip_host(&mut self, address: IpAddr) -> Result<(), ()> { - if self.non_relative() { + if self.cannot_be_a_base() { return Err(()) } @@ -710,7 +714,7 @@ impl Url { /// Change this URL’s password. /// - /// If this URL is non-relative or does not have a host, do nothing and return `Err`. + /// If this URL is cannot-be-a-base or does not have a host, do nothing and return `Err`. pub fn set_password(&mut self, password: Option<&str>) -> Result<(), ()> { if !self.has_host() { return Err(()) @@ -760,7 +764,7 @@ impl Url { /// Change this URL’s username. /// - /// If this URL is non-relative or does not have a host, do nothing and return `Err`. + /// If this URL is cannot-be-a-base or does not have a host, do nothing and return `Err`. 
pub fn set_username(&mut self, username: &str) -> Result<(), ()> { if !self.has_host() { return Err(()) @@ -798,7 +802,7 @@ impl Url { /// /// Do nothing and return `Err` if: /// * The new scheme is not in `[a-zA-Z][a-zA-Z0-9+.-]+` - /// * This URL is non-relative and the new scheme is one of + /// * This URL is cannot-be-a-base and the new scheme is one of /// `http`, `https`, `ws`, `wss`, `ftp`, or `gopher` pub fn set_scheme(&mut self, scheme: &str) -> Result<(), ()> { self.set_scheme_internal(scheme, false) diff --git a/src/parser.rs b/src/parser.rs index 82bb00f7..e4947d89 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -50,7 +50,7 @@ simple_enum_error! { InvalidIpv6Address => "invalid IPv6 address", InvalidDomainCharacter => "invalid domain character", RelativeUrlWithoutBase => "relative URL without a base", - RelativeUrlWithNonRelativeBase => "relative URL with a non-relative base", + RelativeUrlWithCannotBeABaseBase => "relative URL with a cannot-be-a-base base", Overflow => "URLs more than 4 GB are not supported", } @@ -154,8 +154,8 @@ impl<'a> Parser<'a> { if let Some(base_url) = self.base_url { if input.starts_with("#") { self.fragment_only(base_url, input) - } else if base_url.non_relative() { - Err(ParseError::RelativeUrlWithNonRelativeBase) + } else if base_url.cannot_be_a_base() { + Err(ParseError::RelativeUrlWithCannotBeABaseBase) } else { let scheme_type = SchemeType::from(base_url.scheme()); if scheme_type.is_file() { @@ -214,8 +214,8 @@ impl<'a> Parser<'a> { if let Some(base_url) = self.base_url { if slashes_count < 2 && base_url.scheme() == &self.serialization[..scheme_end as usize] { - // Non-relative URLs only happen with "not special" schemes. - debug_assert!(!base_url.non_relative()); + // "Cannot-be-a-base" URLs only happen with "not special" schemes. 
+ debug_assert!(!base_url.cannot_be_a_base()); self.serialization.clear(); return self.parse_relative(input, scheme_type, base_url) } @@ -247,7 +247,7 @@ impl<'a> Parser<'a> { self.serialization.push('/'); self.parse_path(scheme_type, &mut false, path_start, &input[1..]) } else { - self.parse_non_relative_path(input) + self.parse_cannot_be_a_base_path(input) }; self.with_query_and_fragment(scheme_end, username_end, host_start, host_end, host, port, path_start, remaining) @@ -858,7 +858,7 @@ impl<'a> Parser<'a> { } - pub fn parse_non_relative_path<'i>(&mut self, input: &'i str) -> &'i str { + pub fn parse_cannot_be_a_base_path<'i>(&mut self, input: &'i str) -> &'i str { for (i, c, next_i) in input.char_ranges() { match c { '?' | '#' if self.context == Context::UrlParser => return &input[i..], diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs index d9a26563..9f60f5c6 100644 --- a/src/percent_encoding.rs +++ b/src/percent_encoding.rs @@ -74,7 +74,7 @@ macro_rules! define_encode_set { } } -/// This encode set is used for fragment identifier and non-relative scheme data. +/// This encode set is used for the path of cannot-be-a-base URLs. 
#[derive(Copy, Clone)] #[allow(non_camel_case_types)] pub struct SIMPLE_ENCODE_SET; diff --git a/src/webidl.rs b/src/webidl.rs index 22c734a6..4247427c 100644 --- a/src/webidl.rs +++ b/src/webidl.rs @@ -91,7 +91,7 @@ impl WebIdl { /// Setter for https://url.spec.whatwg.org/#dom-url-host pub fn set_host(url: &mut Url, new_host: &str) { - if url.non_relative() { + if url.cannot_be_a_base() { return } let host; @@ -123,7 +123,7 @@ impl WebIdl { /// Setter for https://url.spec.whatwg.org/#dom-url-hostname pub fn set_hostname(url: &mut Url, new_hostname: &str) { - if url.non_relative() { + if url.cannot_be_a_base() { return } let result = Parser::parse_host(new_hostname, SchemeType::from(url.scheme()), |_| ()); @@ -147,7 +147,7 @@ impl WebIdl { pub fn set_port(url: &mut Url, new_port: &str) { let result; { - // has_host implies !non_relative + // has_host implies !cannot_be_a_base let scheme = url.scheme(); if !url.has_host() || scheme == "file" { return @@ -167,7 +167,7 @@ impl WebIdl { /// Setter for https://url.spec.whatwg.org/#dom-url-pathname pub fn set_pathname(url: &mut Url, new_pathname: &str) { - if !url.non_relative() { + if !url.cannot_be_a_base() { url.set_path(new_pathname) } } From ca6a4aeaa3ddf1e11b709e9e240809ee9f040c4a Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 23 Mar 2016 20:54:41 +0100 Subject: [PATCH 46/89] Remove some `unsafe` blocks. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Turns out `String` has a `drain` method too. 
--- idna/src/punycode.rs | 15 +++++++-------- src/lib.rs | 23 ++++++----------------- src/parser.rs | 5 +++-- 3 files changed, 16 insertions(+), 27 deletions(-) diff --git a/idna/src/punycode.rs b/idna/src/punycode.rs index 27525faf..9e5f1769 100644 --- a/idna/src/punycode.rs +++ b/idna/src/punycode.rs @@ -185,11 +185,11 @@ pub fn encode(input: &[char]) -> Option { break } let value = t + ((q - t) % (BASE - t)); - value_to_digit(value, &mut output); + output.push(value_to_digit(value)); q = (q - t) / (BASE - t); k += BASE; } - value_to_digit(q, &mut output); + output.push(value_to_digit(q)); bias = adapt(delta, processed + 1, processed == basic_length); delta = 0; processed += 1; @@ -203,11 +203,10 @@ pub fn encode(input: &[char]) -> Option { #[inline] -fn value_to_digit(value: u32, output: &mut String) { - let code_point = match value { - 0 ... 25 => value + 0x61, // a..z - 26 ... 35 => value - 26 + 0x30, // 0..9 +fn value_to_digit(value: u32) -> char { + match value { + 0 ... 25 => (value as u8 + 'a' as u8) as char, // a..z + 26 ... 35 => (value as u8 - 26 + '0' as u8) as char, // 0..9 _ => panic!() - }; - unsafe { output.as_mut_vec().push(code_point as u8) } + } } diff --git a/src/lib.rs b/src/lib.rs index 4c2504ba..f1e82f25 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -537,9 +537,7 @@ impl Url { // Found a slash other than the initial one let last_slash = last_slash + self.path_start as usize; let path_end = path_len + self.path_start as usize; - unsafe { - self.serialization.as_mut_vec().drain(last_slash..path_end); - } + self.serialization.drain(last_slash..path_end); let offset = (path_end - last_slash) as u32; if let Some(ref mut index) = self.query_start { *index -= offset } if let Some(ref mut index) = self.fragment_start { *index -= offset } @@ -596,10 +594,7 @@ impl Url { match (self.port, port) { (None, None) => {} (Some(_), None) => { - unsafe { - self.serialization.as_mut_vec().drain( - self.host_end as usize .. 
self.path_start as usize); - } + self.serialization.drain(self.host_end as usize .. self.path_start as usize); let offset = self.path_start - self.host_end; self.path_start = self.host_end; if let Some(ref mut index) = self.query_start { *index -= offset } @@ -639,14 +634,10 @@ impl Url { if let Some(host) = host { self.set_host_internal(try!(Host::parse(host).map_err(|_| ())), None) } else if self.has_host() { - // Not debug_assert! since this proves that `unsafe` below is OK: - assert!(self.byte_at(self.scheme_end) == b':'); - assert!(self.byte_at(self.path_start) == b'/'); + debug_assert!(self.byte_at(self.scheme_end) == b':'); + debug_assert!(self.byte_at(self.path_start) == b'/'); let new_path_start = self.scheme_end + 1; - unsafe { - self.serialization.as_mut_vec() - .drain(self.path_start as usize..new_path_start as usize); - } + self.serialization.drain(self.path_start as usize..new_path_start as usize); let offset = self.path_start - new_path_start; self.path_start = new_path_start; self.username_end = new_path_start; @@ -750,9 +741,7 @@ impl Url { } else { self.host_start - 1 // Keep the '@' to separate the username from the host }; - unsafe { - self.serialization.as_mut_vec().drain(start as usize .. end as usize); - } + self.serialization.drain(start as usize .. end as usize); let offset = end - start; self.host_start -= offset; self.host_end -= offset; diff --git a/src/parser.rs b/src/parser.rs index e4947d89..7d3eb7af 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -820,8 +820,9 @@ impl<'a> Parser<'a> { if scheme_type.is_file() && is_windows_drive_letter( &self.serialization[path_start + 1..] 
) { - unsafe { - *self.serialization.as_mut_vec().last_mut().unwrap() = b':' + if self.serialization.ends_with('|') { + self.serialization.pop(); + self.serialization.push(':'); } if *has_host { self.syntax_violation("file: with host and Windows drive letter"); From 6830decb4c4cd356d16c7da516dd7048b4710003 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Thu, 24 Mar 2016 18:09:21 +0100 Subject: [PATCH 47/89] Back to the builder pattern after all. It allows adding new settings (methods) without breaking the API. --- src/encoding.rs | 9 ------- src/lib.rs | 65 ++++++++++++++++++++++++++++++++++--------------- 2 files changed, 46 insertions(+), 28 deletions(-) diff --git a/src/encoding.rs b/src/encoding.rs index 1679a55a..be53ea19 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -27,10 +27,6 @@ pub struct EncodingOverride { #[cfg(feature = "query_encoding")] impl EncodingOverride { - pub fn from_parse_options(options: &::ParseOptions) -> EncodingOverride { - EncodingOverride::from_opt_encoding(options.encoding_override) - } - pub fn from_opt_encoding(encoding: Option) -> EncodingOverride { encoding.map(EncodingOverride::from_encoding).unwrap_or_else(EncodingOverride::utf8) } @@ -80,11 +76,6 @@ pub struct EncodingOverride; #[cfg(not(feature = "query_encoding"))] impl EncodingOverride { - #[inline] - pub fn from_parse_options(_options: &::ParseOptions) -> EncodingOverride { - EncodingOverride - } - #[inline] pub fn utf8() -> EncodingOverride { EncodingOverride diff --git a/src/lib.rs b/src/lib.rs index f1e82f25..400656a1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -183,39 +183,66 @@ pub struct Url { fragment_start: Option, // Before '#', unlike Position::FragmentStart } -#[derive(Default)] +#[derive(Copy, Clone)] pub struct ParseOptions<'a> { - pub base_url: Option<&'a Url>, - #[cfg(feature = "query_encoding")] pub encoding_override: Option, - pub log_syntax_violation: Option<&'a Fn(&'static str)>, + base_url: Option<&'a Url>, + encoding_override: 
encoding::EncodingOverride, + log_syntax_violation: Option<&'a Fn(&'static str)>, +} + +impl<'a> ParseOptions<'a> { + /// Change the base URL + pub fn base_url(mut self, new: Option<&'a Url>) -> Self { + self.base_url = new; + self + } + + /// Override the character encoding of query strings. + /// This is a legacy concept only relevant for HTML. + #[cfg(feature = "query_encoding")] + pub fn encoding_override(mut self, new: Option) -> Self { + self.encoding_override = EncodingOverride::from_opt_encoding(new); + self + } + + /// Call the provided function or closure on non-fatal parse errors. + pub fn log_syntax_violation(mut self, new: Option<&'a Fn(&'static str)>) -> Self { + self.log_syntax_violation = new; + self + } + + /// Parse an URL string with the configuration so far. + pub fn parse(self, input: &str) -> Result { + Parser { + serialization: String::with_capacity(input.len()), + base_url: self.base_url, + query_encoding_override: self.encoding_override, + log_syntax_violation: self.log_syntax_violation, + context: Context::UrlParser, + }.parse_url(input) + } } impl Url { /// Parse an absolute URL from a string. #[inline] pub fn parse(input: &str) -> Result { - Url::parse_with(input, ParseOptions::default()) + Url::options().parse(input) } /// Parse a string as an URL, with this URL as the base URL. #[inline] pub fn join(&self, input: &str) -> Result { - Url::parse_with(input, ParseOptions { base_url: Some(self), ..Default::default() }) + Url::options().base_url(Some(self)).parse(input) } - /// The URL parser with all of its parameters. - /// - /// `encoding_override` is a legacy concept only relevant for HTML. - /// When it’s not needed, - /// `s.parse::()`, `Url::from_str(s)` and `url.join(s)` can be used instead. 
- pub fn parse_with(input: &str, options: ParseOptions) -> Result { - Parser { - serialization: String::with_capacity(input.len()), - base_url: options.base_url, - query_encoding_override: EncodingOverride::from_parse_options(&options), - log_syntax_violation: options.log_syntax_violation, - context: Context::UrlParser, - }.parse_url(input) + /// Return a default `ParseOptions` that can fully configure the URL parser. + pub fn options<'a>() -> ParseOptions<'a> { + ParseOptions { + base_url: None, + encoding_override: EncodingOverride::utf8(), + log_syntax_violation: None, + } } /// Return the serialization of this URL. From b4bbaaa521f6e89f6b0b921a76936214398dfd73 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Thu, 24 Mar 2016 18:19:09 +0100 Subject: [PATCH 48/89] Docs --- src/host.rs | 1 + src/lib.rs | 7 ++++--- src/origin.rs | 2 ++ src/parser.rs | 3 ++- src/webidl.rs | 2 ++ 5 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/host.rs b/src/host.rs index cb960a62..cc789b24 100644 --- a/src/host.rs +++ b/src/host.rs @@ -143,6 +143,7 @@ impl> ToSocketAddrs for HostAndPort { } } +/// Socket addresses for an URL. 
pub struct SocketAddrs { state: SocketAddrsState } diff --git a/src/lib.rs b/src/lib.rs index 400656a1..75dd5433 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -125,8 +125,9 @@ assert_eq!(css_url.as_str(), "http://servo.github.io/rust-url/main.css") extern crate idna; +use encoding::EncodingOverride; use host::HostInternal; -use parser::{Parser, Context}; +use parser::{Parser, Context, SchemeType, to_u32}; use percent_encoding::{PATH_SEGMENT_ENCODE_SET, USERINFO_ENCODE_SET, percent_encode, percent_decode, utf8_percent_encode}; use std::cmp; @@ -139,10 +140,9 @@ use std::ops::{Range, RangeFrom, RangeTo}; use std::path::{Path, PathBuf}; use std::str; -pub use encoding::EncodingOverride; pub use origin::{Origin, OpaqueOrigin}; pub use host::{Host, HostAndPort, SocketAddrs}; -pub use parser::{ParseError, SchemeType, to_u32}; +pub use parser::ParseError; pub use slicing::Position; pub use webidl::WebIdl; @@ -183,6 +183,7 @@ pub struct Url { fragment_start: Option, // Before '#', unlike Position::FragmentStart } +/// Full configuration for the URL parser. #[derive(Copy, Clone)] pub struct ParseOptions<'a> { base_url: Option<&'a Url>, diff --git a/src/origin.rs b/src/origin.rs index f686768c..37ec8b0f 100644 --- a/src/origin.rs +++ b/src/origin.rs @@ -52,6 +52,8 @@ impl Origin { Origin::Opaque(OpaqueOrigin(Arc::new(0))) } + /// Return whether this origin is a (scheme, host, port) tuple + /// (as opposed to an opaque origin). 
pub fn is_tuple(&self) -> bool { matches!(*self, Origin::Tuple(..)) } diff --git a/src/parser.rs b/src/parser.rs index 7d3eb7af..d52ad182 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -10,7 +10,8 @@ use std::ascii::AsciiExt; use std::error::Error; use std::fmt::{self, Formatter, Write}; -use super::{Url, EncodingOverride}; +use Url; +use encoding::EncodingOverride; use host::{Host, HostInternal}; use percent_encoding::{ utf8_percent_encode, percent_encode, diff --git a/src/webidl.rs b/src/webidl.rs index 4247427c..c1ce3636 100644 --- a/src/webidl.rs +++ b/src/webidl.rs @@ -34,10 +34,12 @@ impl WebIdl { } } + /// Getter for https://url.spec.whatwg.org/#dom-url-href pub fn href(url: &Url) -> &str { &url.serialization } + /// Setter for https://url.spec.whatwg.org/#dom-url-href pub fn set_href(url: &mut Url, value: &str) -> Result<(), ParseError> { *url = try!(Url::parse(value)); Ok(()) From ef0a1b2a92b1e4a7eedce7fcab0e299814a3ee86 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 25 Mar 2016 18:30:00 +0100 Subject: [PATCH 49/89] Move webidl.rs to Servo. 
--- src/lib.rs | 75 ++++++++++++++-- src/webidl.rs | 232 -------------------------------------------------- tests/wpt.rs | 44 ++++++---- 3 files changed, 93 insertions(+), 258 deletions(-) delete mode 100644 src/webidl.rs diff --git a/src/lib.rs b/src/lib.rs index 75dd5433..38567c72 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -144,14 +144,12 @@ pub use origin::{Origin, OpaqueOrigin}; pub use host::{Host, HostAndPort, SocketAddrs}; pub use parser::ParseError; pub use slicing::Position; -pub use webidl::WebIdl; mod encoding; mod host; mod origin; mod parser; mod slicing; -mod webidl; pub mod percent_encoding; pub mod form_urlencoded; @@ -822,14 +820,9 @@ impl Url { /// * This URL is cannot-be-a-base and the new scheme is one of /// `http`, `https`, `ws`, `wss`, `ftp`, or `gopher` pub fn set_scheme(&mut self, scheme: &str) -> Result<(), ()> { - self.set_scheme_internal(scheme, false) - } - - fn set_scheme_internal(&mut self, scheme: &str, allow_extra_input_after_colon: bool) - -> Result<(), ()> { let mut parser = Parser::for_setter(String::new()); let remaining = try!(parser.parse_scheme(scheme)); - if (!remaining.is_empty() && !allow_extra_input_after_colon) || + if !remaining.is_empty() || (!self.has_host() && SchemeType::from(&parser.serialization).is_special()) { return Err(()) } @@ -853,6 +846,72 @@ impl Url { Ok(()) } + /// Setter for https://url.spec.whatwg.org/#dom-url-host + /// + /// Unless you need to be interoperable with web browsers, + /// use `set_host` and `set_port` instead. 
+ pub fn quirky_set_host_and_port(&mut self, new_host: &str) -> Result<(), ()> { + if self.cannot_be_a_base() { + return Err(()) + } + let host; + let opt_port; + { + let scheme = self.scheme(); + let result = Parser::parse_host(new_host, SchemeType::from(scheme), |_| ()); + match result { + Ok((h, remaining)) => { + host = h; + opt_port = if remaining.starts_with(':') { + Parser::parse_port(remaining, |_| (), || parser::default_port(scheme)) + .ok().map(|(port, _remaining)| port) + } else { + None + }; + } + Err(_) => return Err(()) + } + } + self.set_host_internal(host, opt_port); + Ok(()) + } + + /// Setter for https://url.spec.whatwg.org/#dom-url-hostname + /// + /// Unless you need to be interoperable with web browsers, use `set_host` instead. + pub fn quirky_set_host(&mut self, new_hostname: &str) -> Result<(), ()> { + if self.cannot_be_a_base() { + return Err(()) + } + let result = Parser::parse_host(new_hostname, SchemeType::from(self.scheme()), |_| ()); + if let Ok((host, _remaining)) = result { + self.set_host_internal(host, None); + Ok(()) + } else { + Err(()) + } + } + + /// Setter for https://url.spec.whatwg.org/#dom-url-port + /// + /// Unless you need to be interoperable with web browsers, use `set_port` instead. + pub fn quirky_set_port(&mut self, new_port: &str) -> Result<(), ()> { + let result; + { + // has_host implies !cannot_be_a_base + let scheme = self.scheme(); + if !self.has_host() || scheme == "file" { + return Err(()) + } + result = Parser::parse_port(new_port, |_| (), || parser::default_port(scheme)) + } + if let Ok((new_port, _remaining)) = result { + self.set_port_internal(new_port); + Ok(()) + } else { + Err(()) + } + } /// Convert a file name as `std::path::Path` into an URL in the `file` scheme. /// /// This returns `Err` if the given path is not absolute or, diff --git a/src/webidl.rs b/src/webidl.rs deleted file mode 100644 index c1ce3636..00000000 --- a/src/webidl.rs +++ /dev/null @@ -1,232 +0,0 @@ -// Copyright 2016 Simon Sapin. 
-// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -use {Url, ParseError}; -use host::Host; -use idna::domain_to_unicode; -use parser::{Parser, SchemeType, default_port}; - -/// https://url.spec.whatwg.org/#api -pub struct WebIdl; - -impl WebIdl { - /// https://url.spec.whatwg.org/#dom-url-domaintoascii - pub fn domain_to_ascii(domain: &str) -> String { - match Host::parse(domain) { - Ok(Host::Domain(domain)) => domain, - _ => String::new(), - } - } - - /// https://url.spec.whatwg.org/#dom-url-domaintounicode - pub fn domain_to_unicode(domain: &str) -> String { - match Host::parse(domain) { - Ok(Host::Domain(ref domain)) => { - let (unicode, _errors) = domain_to_unicode(domain); - unicode - } - _ => String::new(), - } - } - - /// Getter for https://url.spec.whatwg.org/#dom-url-href - pub fn href(url: &Url) -> &str { - &url.serialization - } - - /// Setter for https://url.spec.whatwg.org/#dom-url-href - pub fn set_href(url: &mut Url, value: &str) -> Result<(), ParseError> { - *url = try!(Url::parse(value)); - Ok(()) - } - - /// Getter for https://url.spec.whatwg.org/#dom-url-origin - pub fn origin(url: &Url) -> String { - url.origin().unicode_serialization() - } - - /// Getter for https://url.spec.whatwg.org/#dom-url-protocol - #[inline] - pub fn protocol(url: &Url) -> &str { - debug_assert!(url.byte_at(url.scheme_end) == b':'); - url.slice(..url.scheme_end + 1) - } - - /// Setter for https://url.spec.whatwg.org/#dom-url-protocol - pub fn set_protocol(url: &mut Url, new_protocol: &str) { - let _ = url.set_scheme_internal(new_protocol, true); - } - - /// Getter for https://url.spec.whatwg.org/#dom-url-username - #[inline] - pub fn username(url: &Url) -> &str { - url.username() - } - - /// Setter for https://url.spec.whatwg.org/#dom-url-username - pub fn set_username(url: &mut Url, new_username: &str) { - let _ = 
url.set_username(new_username); - } - - /// Getter for https://url.spec.whatwg.org/#dom-url-password - #[inline] - pub fn password(url: &Url) -> &str { - url.password().unwrap_or("") - } - - /// Setter for https://url.spec.whatwg.org/#dom-url-password - pub fn set_password(url: &mut Url, new_password: &str) { - let _ = url.set_password(if new_password.is_empty() { None } else { Some(new_password) }); - } - - /// Getter for https://url.spec.whatwg.org/#dom-url-host - #[inline] - pub fn host(url: &Url) -> &str { - let host = url.slice(url.host_start..url.path_start); - host - } - - /// Setter for https://url.spec.whatwg.org/#dom-url-host - pub fn set_host(url: &mut Url, new_host: &str) { - if url.cannot_be_a_base() { - return - } - let host; - let opt_port; - { - let scheme = url.scheme(); - let result = Parser::parse_host(new_host, SchemeType::from(scheme), |_| ()); - match result { - Ok((h, remaining)) => { - host = h; - opt_port = if remaining.starts_with(':') { - Parser::parse_port(remaining, |_| (), || default_port(scheme)) - .ok().map(|(port, _remaining)| port) - } else { - None - }; - } - Err(_) => return - } - } - url.set_host_internal(host, opt_port) - } - - /// Getter for https://url.spec.whatwg.org/#dom-url-hostname - #[inline] - pub fn hostname(url: &Url) -> &str { - url.host_str().unwrap_or("") - } - - /// Setter for https://url.spec.whatwg.org/#dom-url-hostname - pub fn set_hostname(url: &mut Url, new_hostname: &str) { - if url.cannot_be_a_base() { - return - } - let result = Parser::parse_host(new_hostname, SchemeType::from(url.scheme()), |_| ()); - if let Ok((host, _remaining)) = result { - url.set_host_internal(host, None) - } - } - - /// Getter for https://url.spec.whatwg.org/#dom-url-port - #[inline] - pub fn port(url: &Url) -> &str { - if url.port.is_some() { - debug_assert!(url.byte_at(url.host_end) == b':'); - url.slice(url.host_end + 1..url.path_start) - } else { - "" - } - } - - /// Setter for https://url.spec.whatwg.org/#dom-url-port - pub fn 
set_port(url: &mut Url, new_port: &str) { - let result; - { - // has_host implies !cannot_be_a_base - let scheme = url.scheme(); - if !url.has_host() || scheme == "file" { - return - } - result = Parser::parse_port(new_port, |_| (), || default_port(scheme)) - } - if let Ok((new_port, _remaining)) = result { - url.set_port_internal(new_port) - } - } - - /// Getter for https://url.spec.whatwg.org/#dom-url-pathname - #[inline] - pub fn pathname(url: &Url) -> &str { - url.path() - } - - /// Setter for https://url.spec.whatwg.org/#dom-url-pathname - pub fn set_pathname(url: &mut Url, new_pathname: &str) { - if !url.cannot_be_a_base() { - url.set_path(new_pathname) - } - } - - /// Getter for https://url.spec.whatwg.org/#dom-url-search - pub fn search(url: &Url) -> &str { - match (url.query_start, url.fragment_start) { - (Some(query_start), None) if { - debug_assert!(url.byte_at(query_start) == b'?'); - // If the query (after ?) is not empty - (query_start as usize) < url.serialization.len() - 1 - } => url.slice(query_start..), - - (Some(query_start), Some(fragment_start)) if { - debug_assert!(url.byte_at(query_start) == b'?'); - // If the fragment (after ?) 
is not empty - query_start < fragment_start - } => url.slice(query_start..fragment_start), - - _ => "", - } - } - - /// Setter for https://url.spec.whatwg.org/#dom-url-search - pub fn set_search(url: &mut Url, new_search: &str) { - url.set_query(match new_search { - "" => None, - _ if new_search.starts_with('?') => Some(&new_search[1..]), - _ => Some(new_search), - }) - } - - /// Getter for https://url.spec.whatwg.org/#dom-url-searchparams - pub fn search_params(url: &Url) -> Vec<(String, String)> { - url.query_pairs().unwrap_or_else(Vec::new) - } - - /// Getter for https://url.spec.whatwg.org/#dom-url-hash - pub fn hash(url: &Url) -> &str { - match url.fragment_start { - Some(start) if { - debug_assert!(url.byte_at(start) == b'#'); - // If the fragment (after #) is not empty - (start as usize) < url.serialization.len() - 1 - } => url.slice(start..), - _ => "", - } - } - - /// Setter for https://url.spec.whatwg.org/#dom-url-hash - pub fn set_hash(url: &mut Url, new_hash: &str) { - if url.scheme() != "javascript" { - url.set_fragment(match new_hash { - "" => None, - _ if new_hash.starts_with('#') => Some(&new_hash[1..]), - _ => Some(new_hash), - }) - } - } -} diff --git a/tests/wpt.rs b/tests/wpt.rs index b7254938..a98fc607 100644 --- a/tests/wpt.rs +++ b/tests/wpt.rs @@ -13,7 +13,7 @@ extern crate test; extern crate url; use rustc_serialize::json::Json; -use url::{Url, WebIdl}; +use url::{Url, Position}; fn run_one(input: String, base: String, expected: Result) { @@ -28,30 +28,38 @@ fn run_one(input: String, base: String, expected: Result) { (Ok(_), Err(())) => panic!("Expected a parse error for URL {:?}", input), }; - macro_rules! assert_getter { - ($attribute: ident) => { assert_getter!($attribute, expected.$attribute) }; - ($attribute: ident, $expected: expr) => { + macro_rules! 
assert_eq { + ($expected: expr, $got: expr) => { { - let a = WebIdl::$attribute(&url); - let b = $expected; - assert!(a == b, "{:?} != {:?} for URL {:?}", a, b, url); + let expected = $expected; + let got = $got; + assert!(expected == got, "{:?} != {} {:?} for URL {:?}", + got, stringify!($expected), expected, url); } } } - assert_getter!(href); + assert_eq!(expected.href, url.as_str()); if let Some(expected_origin) = expected.origin { - assert_getter!(origin, expected_origin); + assert_eq!(expected_origin, url.origin().unicode_serialization()); + } + assert_eq!(expected.protocol, &url.as_str()[..url.scheme().len() + ":".len()]); + assert_eq!(expected.username, url.username()); + assert_eq!(expected.password, url.password().unwrap_or("")); + assert_eq!(expected.host, &url[Position::BeforeHost..Position::AfterPort]); + assert_eq!(expected.hostname, url.host_str().unwrap_or("")); + assert_eq!(expected.port, &url[Position::BeforePort..Position::AfterPort]); + assert_eq!(expected.pathname, url.path()); + assert_eq!(expected.search, trim(&url[Position::AfterPath..Position::AfterQuery])); + assert_eq!(expected.hash, trim(&url[Position::AfterQuery..])); +} + +fn trim(s: &str) -> &str { + if s.len() == 1 { + "" + } else { + s } - assert_getter!(protocol); - assert_getter!(username); - assert_getter!(password); - assert_getter!(host); - assert_getter!(hostname); - assert_getter!(port); - assert_getter!(pathname); - assert_getter!(search); - assert_getter!(hash); } struct TestCase { From d0c2bc2783a708ad934d8bde186a19c5cc90bd5a Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 30 Mar 2016 19:46:55 +0200 Subject: [PATCH 50/89] One unit tests crate. 
--- Cargo.toml | 3 +-- tests/form_urlencoded.rs | 29 ----------------------------- tests/tests.rs | 30 ++++++++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 31 deletions(-) delete mode 100644 tests/form_urlencoded.rs diff --git a/Cargo.toml b/Cargo.toml index 31804048..b65dc40c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,10 +11,9 @@ readme = "README.md" keywords = ["url", "parser"] license = "MIT/Apache-2.0" -[[test]] -name = "form_urlencoded" [[test]] name = "tests" + [[test]] name = "wpt" harness = false diff --git a/tests/form_urlencoded.rs b/tests/form_urlencoded.rs deleted file mode 100644 index 59080cf9..00000000 --- a/tests/form_urlencoded.rs +++ /dev/null @@ -1,29 +0,0 @@ -extern crate url; - -use url::form_urlencoded::*; - -#[test] -fn test_form_urlencoded() { - let pairs = &[ - ("foo".to_string(), "é&".to_string()), - ("bar".to_string(), "".to_string()), - ("foo".to_string(), "#".to_string()) - ]; - let encoded = serialize(pairs); - assert_eq!(encoded, "foo=%C3%A9%26&bar=&foo=%23"); - assert_eq!(parse(encoded.as_bytes()), pairs.to_vec()); -} - -#[test] -fn test_form_serialize() { - let pairs = [("foo", "é&"), - ("bar", ""), - ("foo", "#")]; - - let want = "foo=%C3%A9%26&bar=&foo=%23"; - // Works with referenced tuples - assert_eq!(serialize(pairs.iter()), want); - // Works with owned tuples - assert_eq!(serialize(pairs.iter().map(|p| (p.0, p.1))), want); - -} diff --git a/tests/tests.rs b/tests/tests.rs index c363538f..fe964579 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -208,3 +208,33 @@ fn test_serialization() { assert_eq!(url.as_str(), result); } } + +#[test] +fn test_form_urlencoded() { + use url::form_urlencoded::*; + + let pairs = &[ + ("foo".to_string(), "é&".to_string()), + ("bar".to_string(), "".to_string()), + ("foo".to_string(), "#".to_string()) + ]; + let encoded = serialize(pairs); + assert_eq!(encoded, "foo=%C3%A9%26&bar=&foo=%23"); + assert_eq!(parse(encoded.as_bytes()), pairs.to_vec()); +} + +#[test] +fn 
test_form_serialize() { + use url::form_urlencoded::*; + + let pairs = [("foo", "é&"), + ("bar", ""), + ("foo", "#")]; + + let want = "foo=%C3%A9%26&bar=&foo=%23"; + // Works with referenced tuples + assert_eq!(serialize(pairs.iter()), want); + // Works with owned tuples + assert_eq!(serialize(pairs.iter().map(|p| (p.0, p.1))), want); + +} From 88133435983b70137935d93aca3d5967945339f1 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 1 Apr 2016 16:42:46 +0200 Subject: [PATCH 51/89] Percent-encoding yields `&str` instead of `char`. --- src/percent_encoding.rs | 128 ++++++++++++++++++++++++++++------------ 1 file changed, 89 insertions(+), 39 deletions(-) diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs index 9f60f5c6..eb4deab9 100644 --- a/src/percent_encoding.rs +++ b/src/percent_encoding.rs @@ -10,6 +10,7 @@ use std::ascii::AsciiExt; use std::borrow::Cow; use std::fmt::{self, Write}; use std::slice; +use std::str; /// Represents a set of characters / bytes that should be percent-encoded. /// @@ -26,7 +27,7 @@ use std::slice; /// Use the [`define_encode_set!`](../macro.define_encode_set!.html) macro to define different ones. pub trait EncodeSet: Clone { /// Called with UTF-8 bytes rather than code points. - /// Should return false for all non-ASCII bytes. + /// Should return true for all non-ASCII bytes. fn contains(&self, byte: u8) -> bool; } @@ -116,82 +117,131 @@ define_encode_set! { } } -/// Percent-encode the given bytes and return an iterator of `char` in the ASCII range. +/// Return the percent-encoding of the given bytes. +/// +/// This is unconditional, unlike `percent_encode()` which uses an encode set. 
+pub fn percent_encode_byte(byte: u8) -> &'static str { + let index = usize::from(byte) * 3; + &"\ + %00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F\ + %10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F\ + %20%21%22%23%24%25%26%27%28%29%2A%2B%2C%2D%2E%2F\ + %30%31%32%33%34%35%36%37%38%39%3A%3B%3C%3D%3E%3F\ + %40%41%42%43%44%45%46%47%48%49%4A%4B%4C%4D%4E%4F\ + %50%51%52%53%54%55%56%57%58%59%5A%5B%5C%5D%5E%5F\ + %60%61%62%63%64%65%66%67%68%69%6A%6B%6C%6D%6E%6F\ + %70%71%72%73%74%75%76%77%78%79%7A%7B%7C%7D%7E%7F\ + %80%81%82%83%84%85%86%87%88%89%8A%8B%8C%8D%8E%8F\ + %90%91%92%93%94%95%96%97%98%99%9A%9B%9C%9D%9E%9F\ + %A0%A1%A2%A3%A4%A5%A6%A7%A8%A9%AA%AB%AC%AD%AE%AF\ + %B0%B1%B2%B3%B4%B5%B6%B7%B8%B9%BA%BB%BC%BD%BE%BF\ + %C0%C1%C2%C3%C4%C5%C6%C7%C8%C9%CA%CB%CC%CD%CE%CF\ + %D0%D1%D2%D3%D4%D5%D6%D7%D8%D9%DA%DB%DC%DD%DE%DF\ + %E0%E1%E2%E3%E4%E5%E6%E7%E8%E9%EA%EB%EC%ED%EE%EF\ + %F0%F1%F2%F3%F4%F5%F6%F7%F8%F9%FA%FB%FC%FD%FE%FF\ + "[index..index + 3] +} + +/// Percent-encode the given bytes with the given encode set. +/// +/// The encode set define which bytes (in addition to non-ASCII and controls) +/// need to be percent-encoded. +/// The choice of this set depends on context. +/// For example, `?` needs to be encoded in an URL path but not in a query string. +/// +/// The return value is an iterator of `&str` slices (so it has a `.collect::()` method) +/// that also implements `Display` and `Into>`. +/// The latter returns `Cow::Borrowed` when none of the bytes in `input` +/// are in the given encode set. #[inline] pub fn percent_encode(input: &[u8], encode_set: E) -> PercentEncode { PercentEncode { - iter: input.iter(), + bytes: input, encode_set: encode_set, - state: PercentEncodeState::NextByte, } } -/// Percent-encode the UTF-8 encoding of the given string -/// and return an iterator of `char` in the ASCII range. +/// Percent-encode the UTF-8 encoding of the given string. +/// +/// See `percent_decode()` for how to use the return value. 
#[inline] pub fn utf8_percent_encode(input: &str, encode_set: E) -> PercentEncode { percent_encode(input.as_bytes(), encode_set) } +/// The return type of `percent_decode()`. #[derive(Clone)] pub struct PercentEncode<'a, E: EncodeSet> { - iter: slice::Iter<'a, u8>, + bytes: &'a [u8], encode_set: E, - state: PercentEncodeState, -} - -#[derive(Clone)] -enum PercentEncodeState { - NextByte, - HexHigh(u8), - HexLow(u8), } impl<'a, E: EncodeSet> Iterator for PercentEncode<'a, E> { - type Item = char; - - fn next(&mut self) -> Option { - // str::char::from_digit always returns lowercase. - const UPPER_HEX: [char; 16] = ['0', '1', '2', '3', '4', '5', '6', '7', - '8', '9', 'A', 'B', 'C', 'D', 'E', 'F']; - match self.state { - PercentEncodeState::HexHigh(byte) => { - self.state = PercentEncodeState::HexLow(byte); - Some(UPPER_HEX[(byte >> 4) as usize]) - } - PercentEncodeState::HexLow(byte) => { - self.state = PercentEncodeState::NextByte; - Some(UPPER_HEX[(byte & 0x0F) as usize]) - } - PercentEncodeState::NextByte => { - self.iter.next().map(|&byte| { + type Item = &'a str; + + fn next(&mut self) -> Option<&'a str> { + if let Some((&first_byte, remaining)) = self.bytes.split_first() { + if self.encode_set.contains(first_byte) { + self.bytes = remaining; + Some(percent_encode_byte(first_byte)) + } else { + assert!(first_byte.is_ascii()); + for (i, &byte) in remaining.iter().enumerate() { if self.encode_set.contains(byte) { - self.state = PercentEncodeState::HexHigh(byte); - '%' + // 1 for first_byte + i for previous iterations of this loop + let (unchanged_slice, remaining) = self.bytes.split_at(1 + i); + self.bytes = remaining; + return Some(unsafe { str::from_utf8_unchecked(unchanged_slice) }) } else { assert!(byte.is_ascii()); - byte as char } - }) + } + let unchanged_slice = self.bytes; + self.bytes = &[][..]; + Some(unsafe { str::from_utf8_unchecked(unchanged_slice) }) } + } else { + None } } fn size_hint(&self) -> (usize, Option) { - let (low, high) = 
self.iter.size_hint(); - (low.saturating_add(2) / 3, high) + if self.bytes.is_empty() { + (0, Some(0)) + } else { + (1, Some(self.bytes.len())) + } } } impl<'a, E: EncodeSet> fmt::Display for PercentEncode<'a, E> { fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { for c in (*self).clone() { - try!(formatter.write_char(c)) + try!(formatter.write_str(c)) } Ok(()) } } +impl<'a, E: EncodeSet> From> for Cow<'a, str> { + fn from(mut iter: PercentEncode<'a, E>) -> Self { + match iter.next() { + None => "".into(), + Some(first) => { + match iter.next() { + None => first.into(), + Some(second) => { + let mut string = first.to_owned(); + string.push_str(second); + string.extend(iter); + string.into() + } + } + } + } + } +} + /// Percent-decode the given bytes and return an iterator of bytes. #[inline] pub fn percent_decode(input: &[u8]) -> PercentDecode { From 988494deafc851a76986c62e34637f31a3876da3 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 1 Apr 2016 18:37:31 +0200 Subject: [PATCH 52/89] Replace {lossy_,}utf8_percent_decode with percent_decode().decode_utf8{_lossy,} --- src/host.rs | 4 +- src/percent_encoding.rs | 109 ++++++++++++++++++++++++++++------------ 2 files changed, 79 insertions(+), 34 deletions(-) diff --git a/src/host.rs b/src/host.rs index cc789b24..f1889476 100644 --- a/src/host.rs +++ b/src/host.rs @@ -12,7 +12,7 @@ use std::io; use std::net::{Ipv4Addr, Ipv6Addr, SocketAddr, SocketAddrV4, SocketAddrV6, ToSocketAddrs}; use std::vec; use parser::{ParseResult, ParseError}; -use percent_encoding::lossy_utf8_percent_decode; +use percent_encoding::percent_decode; use idna; #[derive(Copy, Clone, Debug)] @@ -77,7 +77,7 @@ impl Host { } return parse_ipv6addr(&input[1..input.len() - 1]).map(Host::Ipv6) } - let domain = lossy_utf8_percent_decode(input.as_bytes()); + let domain = percent_decode(input.as_bytes()).decode_utf8_lossy(); let domain = try!(idna::domain_to_ascii(&domain)); if domain.find(|c| matches!(c, '\0' | '\t' | '\n' | '\r' | ' ' 
| '#' | '%' | '/' | ':' | '?' | '@' | '[' | '\\' | ']' diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs index eb4deab9..10398e96 100644 --- a/src/percent_encoding.rs +++ b/src/percent_encoding.rs @@ -163,7 +163,7 @@ pub fn percent_encode(input: &[u8], encode_set: E) -> PercentEncod /// Percent-encode the UTF-8 encoding of the given string. /// -/// See `percent_decode()` for how to use the return value. +/// See `percent_encode()` for how to use the return value. #[inline] pub fn utf8_percent_encode(input: &str, encode_set: E) -> PercentEncode { percent_encode(input.as_bytes(), encode_set) @@ -242,62 +242,107 @@ impl<'a, E: EncodeSet> From> for Cow<'a, str> { } } -/// Percent-decode the given bytes and return an iterator of bytes. +/// Percent-decode the given bytes. +/// +/// The return value is an iterator of decoded `u8` bytes +/// that also implements `Into>` +/// (which returns `Cow::Borrowed` when `input` contains no percent-encoded sequence) +/// and has `decode_utf8()` and `decode_utf8_lossy()` methods. #[inline] pub fn percent_decode(input: &[u8]) -> PercentDecode { PercentDecode { - iter: input.iter() + bytes: input.iter() } } +/// The return type of `percent_decode()`. 
#[derive(Clone)] pub struct PercentDecode<'a> { - iter: slice::Iter<'a, u8>, + bytes: slice::Iter<'a, u8>, +} + +fn after_percent_sign(iter: &mut slice::Iter) -> Option { + let initial_iter = iter.clone(); + let h = iter.next().and_then(|&b| (b as char).to_digit(16)); + let l = iter.next().and_then(|&b| (b as char).to_digit(16)); + if let (Some(h), Some(l)) = (h, l) { + Some(h as u8 * 0x10 + l as u8) + } else { + *iter = initial_iter; + None + } } impl<'a> Iterator for PercentDecode<'a> { type Item = u8; fn next(&mut self) -> Option { - self.iter.next().map(|&byte| { + self.bytes.next().map(|&byte| { if byte == b'%' { - let after_percent_sign = self.iter.clone(); - let h = self.iter.next().and_then(|&b| (b as char).to_digit(16)); - let l = self.iter.next().and_then(|&b| (b as char).to_digit(16)); - if let (Some(h), Some(l)) = (h, l) { - return h as u8 * 0x10 + l as u8 - } - self.iter = after_percent_sign; + after_percent_sign(&mut self.bytes).unwrap_or(byte) + } else { + byte } - byte }) } fn size_hint(&self) -> (usize, Option) { - let (low, high) = self.iter.size_hint(); - (low, high.and_then(|high| high.checked_mul(3))) + let bytes = self.bytes.len(); + (bytes / 3, Some(bytes)) } } -/// Percent-decode the given bytes, and decode the result as UTF-8. -/// -/// This is return `Err` when the percent-decoded bytes are not well-formed in UTF-8. 
-pub fn utf8_percent_decode(input: &[u8]) -> Result { - let bytes = percent_decode(input).collect::>(); - String::from_utf8(bytes) +impl<'a> From> for Cow<'a, [u8]> { + fn from(mut iter: PercentDecode<'a>) -> Self { + let initial_bytes = iter.bytes.as_slice(); + while iter.bytes.find(|&&b| b == b'%').is_some() { + if let Some(decoded_byte) = after_percent_sign(&mut iter.bytes) { + let unchanged_bytes_len = initial_bytes.len() - iter.bytes.len() - 3; + let mut decoded = initial_bytes[..unchanged_bytes_len].to_owned(); + decoded.push(decoded_byte); + decoded.extend(iter); + return decoded.into() + } + } + // Nothing to decode + initial_bytes.into() + } } -/// Percent-decode the given bytes, and decode the result as UTF-8. -/// -/// This is “lossy”: invalid UTF-8 percent-encoded byte sequences -/// will be replaced � U+FFFD, the replacement character. -pub fn lossy_utf8_percent_decode(input: &[u8]) -> String { - let bytes = percent_decode(input).collect::>(); - match String::from_utf8_lossy(&bytes) { - Cow::Owned(s) => return s, - Cow::Borrowed(_) => {} +impl<'a> PercentDecode<'a> { + /// Decode the result of percent-decoding as UTF-8. + /// + /// This is return `Err` when the percent-decoded bytes are not well-formed in UTF-8. + pub fn decode_utf8(self) -> Result, str::Utf8Error> { + match self.clone().into() { + Cow::Borrowed(bytes) => { + match str::from_utf8(bytes) { + Ok(s) => Ok(s.into()), + Err(e) => Err(e), + } + } + Cow::Owned(bytes) => { + match String::from_utf8(bytes) { + Ok(s) => Ok(s.into()), + Err(e) => Err(e.utf8_error()), + } + } + } } - unsafe { - String::from_utf8_unchecked(bytes) + + /// Decode the result of percent-decoding as UTF-8, lossily. + /// + /// Invalid UTF-8 percent-encoded byte sequences will be replaced � U+FFFD, + /// the replacement character. 
+ pub fn decode_utf8_lossy(self) -> Cow<'a, str> { + match self.clone().into() { + Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes), + Cow::Owned(bytes) => { + match String::from_utf8_lossy(&bytes) { + Cow::Borrowed(_) => unsafe { String::from_utf8_unchecked(bytes) }.into(), + Cow::Owned(s) => s.into(), + } + } + } } } From 31cdce5a520bdc9be069646cf0a3411889a9622b Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 1 Apr 2016 19:48:40 +0200 Subject: [PATCH 53/89] form_urlencoded::parse returns an iterator. --- src/encoding.rs | 12 ++-- src/form_urlencoded.rs | 132 ++++++++++++++++++++++++++++------------ src/lib.rs | 4 +- src/percent_encoding.rs | 2 +- tests/tests.rs | 11 ++-- 5 files changed, 109 insertions(+), 52 deletions(-) diff --git a/src/encoding.rs b/src/encoding.rs index be53ea19..1fd9c83a 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -43,6 +43,8 @@ impl EncodingOverride { } pub fn lookup(label: &[u8]) -> Option { + // Don't use String::from_utf8_lossy since no encoding label contains U+FFFD + // https://encoding.spec.whatwg.org/#names-and-labels ::std::str::from_utf8(label) .ok() .and_then(encoding_from_whatwg_label) @@ -53,10 +55,10 @@ impl EncodingOverride { self.encoding.is_none() } - pub fn decode(&self, input: &[u8]) -> String { + pub fn decode<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { match self.encoding { - Some(encoding) => encoding.decode(input, DecoderTrap::Replace).unwrap(), - None => String::from_utf8_lossy(input).to_string(), + Some(encoding) => encoding.decode(input, DecoderTrap::Replace).unwrap().into(), + None => String::from_utf8_lossy(input), } } @@ -89,8 +91,8 @@ impl EncodingOverride { true } - pub fn decode(&self, input: &[u8]) -> String { - String::from_utf8_lossy(input).into_owned() + pub fn decode<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { + String::from_utf8_lossy(input) } pub fn encode<'a>(&self, input: &'a str) -> Cow<'a, [u8]> { diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index 
68f967d6..ecd258bb 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -13,27 +13,30 @@ //! Converts between a string (such as an URL’s query string) //! and a sequence of (name, value) pairs. -use std::borrow::Borrow; use std::ascii::AsciiExt; +use std::borrow::{Borrow, Cow}; use encoding::EncodingOverride; use percent_encoding::{percent_encode, percent_decode, FORM_URLENCODED_ENCODE_SET}; /// Convert a byte string in the `application/x-www-form-urlencoded` format -/// into a vector of (name, value) pairs. +/// into a iterator of (name, value) pairs. /// /// Use `parse(input.as_bytes())` to parse a `&str` string. /// -/// The names and values are URL-decoded. For instance, `%23first=%25try%25` will be +/// The names and values are percent-decoded. For instance, `%23first=%25try%25` will be /// converted to `[("#first", "%try%")]`. #[inline] -pub fn parse(input: &[u8]) -> Vec<(String, String)> { - parse_internal(input, EncodingOverride::utf8(), false).unwrap() +pub fn parse(input: &[u8]) -> Parser { + Parser { + input: input, + encoding: EncodingOverride::utf8(), + } } /// Convert a byte string in the `application/x-www-form-urlencoded` format -/// into a vector of (name, value) pairs. +/// into a iterator of (name, value) pairs. /// /// Use `parse(input.as_bytes())` to parse a `&str` string. /// @@ -45,50 +48,101 @@ pub fn parse(input: &[u8]) -> Vec<(String, String)> { /// after percent-decoding. Defaults to UTF-8. /// * `use_charset`: The *use _charset_ flag*. If in doubt, set to `false`. 
#[cfg(feature = "query_encoding")] -#[inline] -pub fn parse_with_encoding(input: &[u8], encoding_override: Option<::encoding::EncodingRef>, - use_charset: bool) - -> Option> { - parse_internal(input, EncodingOverride::from_opt_encoding(encoding_override), use_charset) +pub fn parse_with_encoding<'a>(input: &'a [u8], + encoding_override: Option<::encoding::EncodingRef>, + use_charset: bool) + -> Result, ()> { + let mut encoding = EncodingOverride::from_opt_encoding(encoding_override); + if !(encoding.is_utf8() || input.is_ascii()) { + return Err(()) + } + if use_charset { + for sequence in input.split(|&b| b == b'&') { + // No '+' in "_charset_" to replace with ' '. + if sequence.starts_with(b"_charset_=") { + let value = &sequence[b"_charset_=".len()..]; + // Skip replacing '+' with ' ' in value since no encoding label contains either: + // https://encoding.spec.whatwg.org/#names-and-labels + if let Some(e) = EncodingOverride::lookup(value) { + encoding = e; + break + } + } + } + } + Ok(Parser { + input: input, + encoding: encoding, + }) } +/// The return type of `parse()`. 
+pub struct Parser<'a> { + input: &'a [u8], + encoding: EncodingOverride, +} -fn parse_internal(input: &[u8], mut encoding_override: EncodingOverride, mut use_charset: bool) - -> Option> { - let mut pairs = Vec::new(); - for piece in input.split(|&b| b == b'&') { - if !piece.is_empty() { - let (name, value) = match piece.iter().position(|b| *b == b'=') { - Some(position) => (&piece[..position], &piece[position + 1..]), - None => (piece, &[][..]) - }; +impl<'a> Iterator for Parser<'a> { + type Item = (Cow<'a, str>, Cow<'a, str>); - #[inline] - fn replace_plus(input: &[u8]) -> Vec { - input.iter().map(|&b| if b == b'+' { b' ' } else { b }).collect() + fn next(&mut self) -> Option { + loop { + if self.input.is_empty() { + return None } - - let name = replace_plus(name); - let value = replace_plus(value); - if use_charset && name == b"_charset_" { - if let Some(encoding) = EncodingOverride::lookup(&value) { - encoding_override = encoding; - } - use_charset = false; + let mut split2 = self.input.splitn(2, |&b| b == b'&'); + let sequence = split2.next().unwrap(); + self.input = split2.next().unwrap_or(&[][..]); + if sequence.is_empty() { + continue } - pairs.push((name, value)); + let mut split2 = sequence.splitn(2, |&b| b == b'='); + let name = split2.next().unwrap(); + let value = split2.next().unwrap_or(&[][..]); + return Some(( + decode(name, self.encoding), + decode(value, self.encoding), + )) } } - if !(encoding_override.is_utf8() || input.is_ascii()) { - return None - } +} - Some(pairs.into_iter().map(|(name, value)| ( - encoding_override.decode(&percent_decode(&name).collect::>()), - encoding_override.decode(&percent_decode(&value).collect::>()), - )).collect()) +/// * Replace b'+' with b' ' +/// * Then percent-decode +/// * Then decode with `encoding` +fn decode<'a>(input: &'a [u8], encoding: EncodingOverride) -> Cow<'a, str> { + // The return value can borrow `input` but not an intermediate Cow, + // so we need to return Owned if either of the intermediate Cow 
is Owned + match replace_plus(input) { + Cow::Owned(replaced) => { + let decoded: Cow<_> = percent_decode(&replaced).into(); + encoding.decode(&decoded).into_owned().into() + } + Cow::Borrowed(replaced) => { + match percent_decode(replaced).into() { + Cow::Owned(decoded) => encoding.decode(&decoded).into_owned().into(), + Cow::Borrowed(decoded) => encoding.decode(decoded), + } + } + } } +/// Replace b'+' with b' ' +fn replace_plus<'a>(input: &'a [u8]) -> Cow<'a, [u8]> { + match input.iter().position(|&b| b == b'+') { + None => input.into(), + Some(first_position) => { + let mut replaced = input.to_owned(); + replaced[first_position] = b' '; + for byte in &mut replaced[first_position + 1..] { + if *byte == b'+' { + *byte = b' '; + } + } + replaced.into() + } + } +} /// Convert an iterator of (name, value) pairs /// into a string in the `application/x-www-form-urlencoded` format. diff --git a/src/lib.rs b/src/lib.rs index 38567c72..2673d37e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -988,9 +988,9 @@ impl Url { } /// Parse the URL’s query string, if any, as `application/x-www-form-urlencoded` - /// and return a vector of (key, value) pairs. + /// and return an iterator of (key, value) pairs. #[inline] - pub fn query_pairs(&self) -> Option> { + pub fn query_pairs(&self) -> Option { self.query().map(|query| form_urlencoded::parse(query.as_bytes())) } diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs index 10398e96..bb3db07f 100644 --- a/src/percent_encoding.rs +++ b/src/percent_encoding.rs @@ -249,7 +249,7 @@ impl<'a, E: EncodeSet> From> for Cow<'a, str> { /// (which returns `Cow::Borrowed` when `input` contains no percent-encoded sequence) /// and has `decode_utf8()` and `decode_utf8_lossy()` methods. 
#[inline] -pub fn percent_decode(input: &[u8]) -> PercentDecode { +pub fn percent_decode<'a>(input: &'a [u8]) -> PercentDecode<'a> { PercentDecode { bytes: input.iter() } diff --git a/tests/tests.rs b/tests/tests.rs index fe964579..36882689 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -8,6 +8,7 @@ extern crate url; +use std::borrow::Cow; use std::net::{Ipv4Addr, Ipv6Addr}; use std::path::{Path, PathBuf}; use url::{Host, Url}; @@ -213,14 +214,14 @@ fn test_serialization() { fn test_form_urlencoded() { use url::form_urlencoded::*; - let pairs = &[ - ("foo".to_string(), "é&".to_string()), - ("bar".to_string(), "".to_string()), - ("foo".to_string(), "#".to_string()) + let pairs: &[(Cow, Cow)] = &[ + ("foo".into(), "é&".into()), + ("bar".into(), "".into()), + ("foo".into(), "#".into()) ]; let encoded = serialize(pairs); assert_eq!(encoded, "foo=%C3%A9%26&bar=&foo=%23"); - assert_eq!(parse(encoded.as_bytes()), pairs.to_vec()); + assert_eq!(parse(encoded.as_bytes()).collect::>(), pairs.to_vec()); } #[test] From 837a8da8f9393139a576fd55070e45469993a639 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Sat, 2 Apr 2016 11:50:29 +0200 Subject: [PATCH 54/89] Introduce "output encoding", per spec. 
--- src/encoding.rs | 14 ++++++++++++++ src/form_urlencoded.rs | 2 +- src/lib.rs | 2 +- 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/encoding.rs b/src/encoding.rs index 1fd9c83a..63f59514 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -51,6 +51,16 @@ impl EncodingOverride { .map(EncodingOverride::from_encoding) } + /// https://encoding.spec.whatwg.org/#get-an-output-encoding + pub fn to_output_encoding(self) -> Self { + if let Some(encoding) = self.encoding { + if matches!(encoding.name(), "utf-16le" | "utf-16be") { + return Self::utf8() + } + } + self + } + pub fn is_utf8(&self) -> bool { self.encoding.is_none() } @@ -87,6 +97,10 @@ impl EncodingOverride { None } + pub fn to_output_encoding(self) -> Self { + self + } + pub fn is_utf8(&self) -> bool { true } diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index ecd258bb..b79085f1 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -167,7 +167,7 @@ pub fn serialize_with_encoding(pairs: I, encoding_override: Option<::encoding::EncodingRef>) -> String where I: IntoIterator, I::Item: Borrow<(K, V)>, K: AsRef, V: AsRef { - serialize_internal(pairs, EncodingOverride::from_opt_encoding(encoding_override)) + serialize_internal(pairs, EncodingOverride::from_opt_encoding(encoding_override).to_output_encoding()) } fn serialize_internal(pairs: I, encoding_override: EncodingOverride) -> String diff --git a/src/lib.rs b/src/lib.rs index 2673d37e..646b23a1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -200,7 +200,7 @@ impl<'a> ParseOptions<'a> { /// This is a legacy concept only relevant for HTML. 
#[cfg(feature = "query_encoding")] pub fn encoding_override(mut self, new: Option) -> Self { - self.encoding_override = EncodingOverride::from_opt_encoding(new); + self.encoding_override = EncodingOverride::from_opt_encoding(new).to_output_encoding(); self } From 507ff3f80f95c6ac53ccd72e5cac2a5fcdf3b6d6 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Sat, 2 Apr 2016 11:51:30 +0200 Subject: [PATCH 55/89] Self --- src/encoding.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/encoding.rs b/src/encoding.rs index 63f59514..17938298 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -27,28 +27,28 @@ pub struct EncodingOverride { #[cfg(feature = "query_encoding")] impl EncodingOverride { - pub fn from_opt_encoding(encoding: Option) -> EncodingOverride { - encoding.map(EncodingOverride::from_encoding).unwrap_or_else(EncodingOverride::utf8) + pub fn from_opt_encoding(encoding: Option) -> Self { + encoding.map(Self::from_encoding).unwrap_or_else(Self::utf8) } - pub fn from_encoding(encoding: EncodingRef) -> EncodingOverride { + pub fn from_encoding(encoding: EncodingRef) -> Self { EncodingOverride { encoding: if encoding.name() == "utf-8" { None } else { Some(encoding) } } } #[inline] - pub fn utf8() -> EncodingOverride { + pub fn utf8() -> Self { EncodingOverride { encoding: None } } - pub fn lookup(label: &[u8]) -> Option { + pub fn lookup(label: &[u8]) -> Option { // Don't use String::from_utf8_lossy since no encoding label contains U+FFFD // https://encoding.spec.whatwg.org/#names-and-labels ::std::str::from_utf8(label) .ok() .and_then(encoding_from_whatwg_label) - .map(EncodingOverride::from_encoding) + .map(Self::from_encoding) } /// https://encoding.spec.whatwg.org/#get-an-output-encoding @@ -89,11 +89,11 @@ pub struct EncodingOverride; #[cfg(not(feature = "query_encoding"))] impl EncodingOverride { #[inline] - pub fn utf8() -> EncodingOverride { + pub fn utf8() -> Self { EncodingOverride } - pub fn lookup(_label: &[u8]) -> 
Option { + pub fn lookup(_label: &[u8]) -> Option { None } From 8e6418a3fbcbefea57d1b9dd8d1340445a963d4c Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Sat, 2 Apr 2016 11:52:08 +0200 Subject: [PATCH 56/89] =?UTF-8?q?Don=E2=80=99t=20take=20time=20to=20run=20?= =?UTF-8?q?0=20in-crate=20unit=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Cargo.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Cargo.toml b/Cargo.toml index b65dc40c..d396b717 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,6 +18,9 @@ name = "tests" name = "wpt" harness = false +[lib] +test = false + [dev-dependencies] rustc-test = "0.1" rustc-serialize = "0.3" From d3e9824a21bbb6458bd8059d7a2fb9fd79f52912 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Sat, 2 Apr 2016 12:36:14 +0200 Subject: [PATCH 57/89] Cow wrangling. (Reduce allocations/copying.) --- src/encoding.rs | 40 ++++++++++++++++++++++++++++----------- src/form_urlencoded.rs | 26 +++++++------------------ src/parser.rs | 2 +- src/percent_encoding.rs | 42 ++++++++++++++++++++++------------------- 4 files changed, 60 insertions(+), 50 deletions(-) diff --git a/src/encoding.rs b/src/encoding.rs index 17938298..718d53cd 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -65,18 +65,17 @@ impl EncodingOverride { self.encoding.is_none() } - pub fn decode<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { + pub fn decode<'a>(&self, input: Cow<'a, [u8]>) -> Cow<'a, str> { match self.encoding { - Some(encoding) => encoding.decode(input, DecoderTrap::Replace).unwrap().into(), - None => String::from_utf8_lossy(input), + Some(encoding) => encoding.decode(&input, DecoderTrap::Replace).unwrap().into(), + None => decode_utf8_lossy(input), } } - pub fn encode<'a>(&self, input: &'a str) -> Cow<'a, [u8]> { + pub fn encode<'a>(&self, input: Cow<'a, str>) -> Cow<'a, [u8]> { match self.encoding { - Some(encoding) => Cow::Owned( - encoding.encode(input, EncoderTrap::NcrEscape).unwrap()), - None => 
Cow::Borrowed(input.as_bytes()), // UTF-8 + Some(encoding) => encoding.encode(&input, EncoderTrap::NcrEscape).unwrap().into(), + None => encode_utf8(input) } } } @@ -105,11 +104,30 @@ impl EncodingOverride { true } - pub fn decode<'a>(&self, input: &'a [u8]) -> Cow<'a, str> { - String::from_utf8_lossy(input) + pub fn decode<'a>(&self, input: Cow<'a, [u8]>) -> Cow<'a, str> { + decode_utf8_lossy(input) } - pub fn encode<'a>(&self, input: &'a str) -> Cow<'a, [u8]> { - Cow::Borrowed(input.as_bytes()) + pub fn encode<'a>(&self, input: Cow<'a, str>) -> Cow<'a, [u8]> { + encode_utf8(input) + } +} + +pub fn decode_utf8_lossy(input: Cow<[u8]>) -> Cow { + match input { + Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes), + Cow::Owned(bytes) => { + match String::from_utf8_lossy(&bytes) { + Cow::Borrowed(_) => unsafe { String::from_utf8_unchecked(bytes) }.into(), + Cow::Owned(s) => s.into(), + } + } + } +} + +pub fn encode_utf8(input: Cow) -> Cow<[u8]> { + match input { + Cow::Borrowed(s) => s.as_bytes().into(), + Cow::Owned(s) => s.into_bytes().into() } } diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index b79085f1..4d19499b 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -107,24 +107,12 @@ impl<'a> Iterator for Parser<'a> { } } -/// * Replace b'+' with b' ' -/// * Then percent-decode -/// * Then decode with `encoding` -fn decode<'a>(input: &'a [u8], encoding: EncodingOverride) -> Cow<'a, str> { - // The return value can borrow `input` but not an intermediate Cow, - // so we need to return Owned if either of the intermediate Cow is Owned - match replace_plus(input) { - Cow::Owned(replaced) => { - let decoded: Cow<_> = percent_decode(&replaced).into(); - encoding.decode(&decoded).into_owned().into() - } - Cow::Borrowed(replaced) => { - match percent_decode(replaced).into() { - Cow::Owned(decoded) => encoding.decode(&decoded).into_owned().into(), - Cow::Borrowed(decoded) => encoding.decode(decoded), - } - } - } +fn decode(input: &[u8], 
encoding: EncodingOverride) -> Cow { + let replaced = replace_plus(input); + encoding.decode(match percent_decode(&replaced).if_any() { + Some(vec) => vec.into(), + None => replaced, + }) } /// Replace b'+' with b' ' @@ -175,7 +163,7 @@ where I: IntoIterator, I::Item: Borrow<(K, V)>, K: AsRef, V: AsRef { #[inline] fn byte_serialize(input: &str, output: &mut String, encoding_override: EncodingOverride) { - for &byte in encoding_override.encode(input).iter() { + for &byte in encoding_override.encode(input.into()).iter() { if byte == b' ' { output.push_str("+") } else { diff --git a/src/parser.rs b/src/parser.rs index d52ad182..8ecc790a 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -945,7 +945,7 @@ impl<'a> Parser<'a> { "http" | "https" | "file" | "ftp" | "gopher" => self.query_encoding_override, _ => EncodingOverride::utf8(), }; - let query_bytes = encoding.encode(&query); + let query_bytes = encoding.encode(query.into()); self.serialization.extend(percent_encode(&query_bytes, QUERY_ENCODE_SET)); remaining } diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs index bb3db07f..c1c79473 100644 --- a/src/percent_encoding.rs +++ b/src/percent_encoding.rs @@ -6,6 +6,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. 
+use encoding; use std::ascii::AsciiExt; use std::borrow::Cow; use std::fmt::{self, Write}; @@ -293,23 +294,34 @@ impl<'a> Iterator for PercentDecode<'a> { } impl<'a> From> for Cow<'a, [u8]> { - fn from(mut iter: PercentDecode<'a>) -> Self { - let initial_bytes = iter.bytes.as_slice(); - while iter.bytes.find(|&&b| b == b'%').is_some() { - if let Some(decoded_byte) = after_percent_sign(&mut iter.bytes) { - let unchanged_bytes_len = initial_bytes.len() - iter.bytes.len() - 3; + fn from(iter: PercentDecode<'a>) -> Self { + match iter.if_any() { + Some(vec) => vec.into(), + None => iter.bytes.as_slice().into(), + } + } +} + +impl<'a> PercentDecode<'a> { + /// If the percent-decoding is different from the input, return it as a new bytes vector. + pub fn if_any(&self) -> Option> { + let mut bytes_iter = self.bytes.clone(); + while bytes_iter.find(|&&b| b == b'%').is_some() { + if let Some(decoded_byte) = after_percent_sign(&mut bytes_iter) { + let initial_bytes = self.bytes.as_slice(); + let unchanged_bytes_len = initial_bytes.len() - bytes_iter.len() - 3; let mut decoded = initial_bytes[..unchanged_bytes_len].to_owned(); decoded.push(decoded_byte); - decoded.extend(iter); - return decoded.into() + decoded.extend(PercentDecode { + bytes: bytes_iter + }); + return Some(decoded) } } // Nothing to decode - initial_bytes.into() + None } -} -impl<'a> PercentDecode<'a> { /// Decode the result of percent-decoding as UTF-8. /// /// This is return `Err` when the percent-decoded bytes are not well-formed in UTF-8. @@ -335,14 +347,6 @@ impl<'a> PercentDecode<'a> { /// Invalid UTF-8 percent-encoded byte sequences will be replaced � U+FFFD, /// the replacement character. 
pub fn decode_utf8_lossy(self) -> Cow<'a, str> { - match self.clone().into() { - Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes), - Cow::Owned(bytes) => { - match String::from_utf8_lossy(&bytes) { - Cow::Borrowed(_) => unsafe { String::from_utf8_unchecked(bytes) }.into(), - Cow::Owned(s) => s.into(), - } - } - } + encoding::decode_utf8_lossy(self.clone().into()) } } From bea7a484aff1bf9ddafe51fb619064975cab10d1 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Sat, 2 Apr 2016 17:58:20 +0200 Subject: [PATCH 58/89] form_urlencoded::Serializer is a "stateful" object. --- src/encoding.rs | 19 ++--- src/form_urlencoded.rs | 171 ++++++++++++++++++++++++++++------------ src/lib.rs | 4 +- src/percent_encoding.rs | 8 -- tests/tests.rs | 28 +++---- 5 files changed, 141 insertions(+), 89 deletions(-) diff --git a/src/encoding.rs b/src/encoding.rs index 718d53cd..42f1a249 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -65,6 +65,13 @@ impl EncodingOverride { self.encoding.is_none() } + pub fn name(&self) -> &'static str { + match self.encoding { + Some(encoding) => encoding.name(), + None => "utf-8", + } + } + pub fn decode<'a>(&self, input: Cow<'a, [u8]>) -> Cow<'a, str> { match self.encoding { Some(encoding) => encoding.decode(&input, DecoderTrap::Replace).unwrap().into(), @@ -92,18 +99,6 @@ impl EncodingOverride { EncodingOverride } - pub fn lookup(_label: &[u8]) -> Option { - None - } - - pub fn to_output_encoding(self) -> Self { - self - } - - pub fn is_utf8(&self) -> bool { - true - } - pub fn decode<'a>(&self, input: Cow<'a, [u8]>) -> Cow<'a, str> { decode_utf8_lossy(input) } diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index 4d19499b..bfdc687a 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -13,10 +13,10 @@ //! Converts between a string (such as an URL’s query string) //! and a sequence of (name, value) pairs. 
-use std::ascii::AsciiExt; -use std::borrow::{Borrow, Cow}; use encoding::EncodingOverride; -use percent_encoding::{percent_encode, percent_decode, FORM_URLENCODED_ENCODE_SET}; +use percent_encoding::{percent_encode_byte, percent_decode}; +use std::borrow::{Borrow, Cow}; +use std::str; /// Convert a byte string in the `application/x-www-form-urlencoded` format @@ -27,8 +27,8 @@ use percent_encoding::{percent_encode, percent_decode, FORM_URLENCODED_ENCODE_SE /// The names and values are percent-decoded. For instance, `%23first=%25try%25` will be /// converted to `[("#first", "%try%")]`. #[inline] -pub fn parse(input: &[u8]) -> Parser { - Parser { +pub fn parse(input: &[u8]) -> Parse { + Parse { input: input, encoding: EncodingOverride::utf8(), } @@ -51,7 +51,9 @@ pub fn parse(input: &[u8]) -> Parser { pub fn parse_with_encoding<'a>(input: &'a [u8], encoding_override: Option<::encoding::EncodingRef>, use_charset: bool) - -> Result, ()> { + -> Result, ()> { + use std::ascii::AsciiExt; + let mut encoding = EncodingOverride::from_opt_encoding(encoding_override); if !(encoding.is_utf8() || input.is_ascii()) { return Err(()) @@ -70,19 +72,19 @@ pub fn parse_with_encoding<'a>(input: &'a [u8], } } } - Ok(Parser { + Ok(Parse { input: input, encoding: encoding, }) } /// The return type of `parse()`. -pub struct Parser<'a> { +pub struct Parse<'a> { input: &'a [u8], encoding: EncodingOverride, } -impl<'a> Iterator for Parser<'a> { +impl<'a> Iterator for Parse<'a> { type Item = (Cow<'a, str>, Cow<'a, str>); fn next(&mut self) -> Option { @@ -132,55 +134,124 @@ fn replace_plus<'a>(input: &'a [u8]) -> Cow<'a, [u8]> { } } -/// Convert an iterator of (name, value) pairs -/// into a string in the `application/x-www-form-urlencoded` format. 
-#[inline] -pub fn serialize(pairs: I) -> String -where I: IntoIterator, I::Item: Borrow<(K, V)>, K: AsRef, V: AsRef { - serialize_internal(pairs, EncodingOverride::utf8()) +/// The [`application/x-www-form-urlencoded` byte serializer]( +/// https://url.spec.whatwg.org/#concept-urlencoded-byte-serializer). +/// +/// Return an iterator of `&str` slices. +pub fn byte_serialize(input: &[u8]) -> ByteSerialize { + ByteSerialize { + bytes: input, + } } -/// Convert an iterator of (name, value) pairs -/// into a string in the `application/x-www-form-urlencoded` format. -/// -/// This function is only available if the `query_encoding` Cargo feature is enabled. -/// -/// Arguments: -/// -/// * `encoding_override`: The character encoding each name and values is encoded as -/// before percent-encoding. Defaults to UTF-8. -#[cfg(feature = "query_encoding")] -#[inline] -pub fn serialize_with_encoding(pairs: I, - encoding_override: Option<::encoding::EncodingRef>) - -> String -where I: IntoIterator, I::Item: Borrow<(K, V)>, K: AsRef, V: AsRef { - serialize_internal(pairs, EncodingOverride::from_opt_encoding(encoding_override).to_output_encoding()) +/// Return value of `byte_serialize()`. +pub struct ByteSerialize<'a> { + bytes: &'a [u8], +} + +fn byte_serialized_unchanged(byte: u8) -> bool { + matches!(byte, b'*' | b'-' | b'.' | b'0' ... b'9' | b'A' ... b'Z' | b'_' | b'a' ... 
b'z') } -fn serialize_internal(pairs: I, encoding_override: EncodingOverride) -> String -where I: IntoIterator, I::Item: Borrow<(K, V)>, K: AsRef, V: AsRef { - #[inline] - fn byte_serialize(input: &str, output: &mut String, - encoding_override: EncodingOverride) { - for &byte in encoding_override.encode(input.into()).iter() { - if byte == b' ' { - output.push_str("+") - } else { - output.extend(percent_encode(&[byte], FORM_URLENCODED_ENCODE_SET)) +impl<'a> Iterator for ByteSerialize<'a> { + type Item = &'a str; + + fn next(&mut self) -> Option<&'a str> { + if let Some((&first, tail)) = self.bytes.split_first() { + if !byte_serialized_unchanged(first) { + self.bytes = tail; + return Some(if first == b' ' { "+" } else { percent_encode_byte(first) }) } + let position = tail.iter().position(|&b| !byte_serialized_unchanged(b)); + let (unchanged_slice, remaining) = match position { + // 1 for first_byte + i unchanged in tail + Some(i) => self.bytes.split_at(1 + i), + None => (self.bytes, &[][..]), + }; + self.bytes = remaining; + Some(unsafe { str::from_utf8_unchecked(unchanged_slice) }) + } else { + None } } - let mut output = String::new(); - for pair in pairs { - let &(ref name, ref value) = pair.borrow(); - if !output.is_empty() { - output.push_str("&"); + fn size_hint(&self) -> (usize, Option) { + if self.bytes.is_empty() { + (0, Some(0)) + } else { + (1, Some(self.bytes.len())) } - byte_serialize(name.as_ref(), &mut output, encoding_override); - output.push_str("="); - byte_serialize(value.as_ref(), &mut output, encoding_override); } - output +} + +/// The [`application/x-www-form-urlencoded` serializer]( +/// https://url.spec.whatwg.org/#concept-urlencoded-serializer). +pub struct Serializer<'a> { + string: &'a mut String, + start_position: usize, + encoding: EncodingOverride, +} + +impl<'a> Serializer<'a> { + /// Create a new `application/x-www-form-urlencoded` serializer + /// for the given range of the given string. 
+ /// + /// If the range is non-empty, the corresponding slice of the string is assumed + /// to already be in `application/x-www-form-urlencoded` format. + pub fn new(string: &'a mut String, start_position: usize) -> Self { + &string[start_position..]; // Panic if out of bounds + Serializer { + string: string, + start_position: start_position, + encoding: EncodingOverride::utf8(), + } + } + + /// Remove any existing name/value pair. + pub fn clear(&mut self) { + self.string.truncate(self.start_position) + } + + /// Set the character encoding to be used for names and values before percent-encoding. + #[cfg(feature = "query_encoding")] + pub fn encoding_override(&mut self, new: Option<::encoding::EncodingRef>) { + self.encoding = EncodingOverride::from_opt_encoding(new).to_output_encoding();; + } + + fn append_separator_if_needed(&mut self) { + if self.string.len() > self.start_position { + self.string.push('&') + } + } + + /// Serialize and append a name/value pair. + pub fn append_pair(&mut self, name: &str, value: &str) { + self.append_separator_if_needed(); + self.string.extend(byte_serialize(&self.encoding.encode(name.into()))); + self.string.push('='); + self.string.extend(byte_serialize(&self.encoding.encode(value.into()))); + } + + /// Serialize and append a number of name/value pairs. + /// + /// This simply calls `append_pair` repeatedly. + /// This can be more convenient, so the user doesn’t need to introduce a block + /// to limit the scope of `Serializer`’s borrow of its string. + pub fn append_pairs(&mut self, iter: I) + where I: IntoIterator, I::Item: Borrow<(K, V)>, K: AsRef, V: AsRef { + for pair in iter { + let &(ref k, ref v) = pair.borrow(); + self.append_pair(k.as_ref(), v.as_ref()) + } + } + + /// Add a name/value pair whose name is `_charset_` + /// and whose value is the character encoding’s name. + /// (See the `encoding_override()` method.) 
+ #[cfg(feature = "query_encoding")] + pub fn append_charset(&mut self) { + self.append_separator_if_needed(); + self.string.push_str("_charset_="); + self.string.push_str(self.encoding.name()); + } } diff --git a/src/lib.rs b/src/lib.rs index 646b23a1..5d5a7a38 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -990,8 +990,8 @@ impl Url { /// Parse the URL’s query string, if any, as `application/x-www-form-urlencoded` /// and return an iterator of (key, value) pairs. #[inline] - pub fn query_pairs(&self) -> Option { - self.query().map(|query| form_urlencoded::parse(query.as_bytes())) + pub fn query_pairs(&self) -> form_urlencoded::Parse { + form_urlencoded::parse(self.query().unwrap_or("").as_bytes()) } // Private helper methods: diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs index c1c79473..1716a91e 100644 --- a/src/percent_encoding.rs +++ b/src/percent_encoding.rs @@ -110,14 +110,6 @@ define_encode_set! { } } -define_encode_set! { - /// This encode set is used in `application/x-www-form-urlencoded` serialization. - pub FORM_URLENCODED_ENCODE_SET = [SIMPLE_ENCODE_SET] | { - ' ', '!', '"', '#', '$', '%', '&', '\'', '(', ')', '+', ',', '/', ':', ';', - '<', '=', '>', '?', '@', '[', '\\', ']', '^', '`', '{', '|', '}', '~' - } -} - /// Return the percent-encoding of the given bytes. /// /// This is unconditional, unlike `percent_encode()` which uses an encode set. diff --git a/tests/tests.rs b/tests/tests.rs index 36882689..1024f4b9 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -11,7 +11,7 @@ extern crate url; use std::borrow::Cow; use std::net::{Ipv4Addr, Ipv6Addr}; use std::path::{Path, PathBuf}; -use url::{Host, Url}; +use url::{Host, Url, form_urlencoded}; macro_rules! 
assert_from_file_path { ($path: expr) => { assert_from_file_path!($path, $path) }; @@ -212,30 +212,24 @@ fn test_serialization() { #[test] fn test_form_urlencoded() { - use url::form_urlencoded::*; - let pairs: &[(Cow, Cow)] = &[ ("foo".into(), "é&".into()), ("bar".into(), "".into()), ("foo".into(), "#".into()) ]; - let encoded = serialize(pairs); + let mut encoded = String::new(); + form_urlencoded::Serializer::new(&mut encoded, 0).append_pairs(pairs); assert_eq!(encoded, "foo=%C3%A9%26&bar=&foo=%23"); - assert_eq!(parse(encoded.as_bytes()).collect::>(), pairs.to_vec()); + assert_eq!(form_urlencoded::parse(encoded.as_bytes()).collect::>(), pairs.to_vec()); } #[test] fn test_form_serialize() { - use url::form_urlencoded::*; - - let pairs = [("foo", "é&"), - ("bar", ""), - ("foo", "#")]; - - let want = "foo=%C3%A9%26&bar=&foo=%23"; - // Works with referenced tuples - assert_eq!(serialize(pairs.iter()), want); - // Works with owned tuples - assert_eq!(serialize(pairs.iter().map(|p| (p.0, p.1))), want); - + let mut encoded = String::new(); + form_urlencoded::Serializer::new(&mut encoded, 0).append_pairs(&[ + ("foo", "é&"), + ("bar", ""), + ("foo", "#") + ]); + assert_eq!(encoded, "foo=%C3%A9%26&bar=&foo=%23"); } From 6f716e1830238908d3236b7cdef7ebf57c45b829 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Sat, 2 Apr 2016 18:26:09 +0200 Subject: [PATCH 59/89] Add Url::mutate_query_pairs --- src/form_urlencoded.rs | 8 ++--- src/lib.rs | 80 ++++++++++++++++++++++++++++++++---------- 2 files changed, 65 insertions(+), 23 deletions(-) diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index bfdc687a..aaf94a9d 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -6,7 +6,7 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -//! Parser and serializer for the [`application/x-www-form-urlencoded` format]( +//! Parser and serializer for the [`application/x-www-form-urlencoded` syntax]( //! 
http://url.spec.whatwg.org/#application/x-www-form-urlencoded), //! as used by HTML forms. //! @@ -19,7 +19,7 @@ use std::borrow::{Borrow, Cow}; use std::str; -/// Convert a byte string in the `application/x-www-form-urlencoded` format +/// Convert a byte string in the `application/x-www-form-urlencoded` syntax /// into a iterator of (name, value) pairs. /// /// Use `parse(input.as_bytes())` to parse a `&str` string. @@ -35,7 +35,7 @@ pub fn parse(input: &[u8]) -> Parse { } -/// Convert a byte string in the `application/x-www-form-urlencoded` format +/// Convert a byte string in the `application/x-www-form-urlencoded` syntax /// into a iterator of (name, value) pairs. /// /// Use `parse(input.as_bytes())` to parse a `&str` string. @@ -197,7 +197,7 @@ impl<'a> Serializer<'a> { /// for the given range of the given string. /// /// If the range is non-empty, the corresponding slice of the string is assumed - /// to already be in `application/x-www-form-urlencoded` format. + /// to already be in `application/x-www-form-urlencoded` syntax. pub fn new(string: &'a mut String, start_position: usize) -> Self { &string[start_position..]; // Panic if out of bounds Serializer { diff --git a/src/lib.rs b/src/lib.rs index 5d5a7a38..1fa357a7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -450,6 +450,13 @@ impl Url { } } + /// Parse the URL’s query string, if any, as `application/x-www-form-urlencoded` + /// and return an iterator of (key, value) pairs. + #[inline] + pub fn query_pairs(&self) -> form_urlencoded::Parse { + form_urlencoded::parse(self.query().unwrap_or("").as_bytes()) + } + /// Return this URL’s fragment identifier, if any. /// /// **Note:** the parser did *not* percent-encode this component, @@ -487,24 +494,66 @@ impl Url { /// Change this URL’s query string. 
pub fn set_query(&mut self, query: Option<&str>) { + self.set_query_internal(|url| { + // Remove any previous query + if let Some(start) = url.query_start.take() { + debug_assert!(url.byte_at(start) == b'?'); + url.serialization.truncate(start as usize); + } + // Write the new query, if any + if let Some(input) = query { + url.query_start = Some(to_u32(url.serialization.len()).unwrap()); + url.serialization.push('?'); + let scheme_end = url.scheme_end; + url.mutate(|parser| parser.parse_query(scheme_end, input)); + } + }) + } + + /// Change this URL’s query string, viewed as a sequence of name/value pairs + /// in `application/x-www-form-urlencoded` syntax. + /// + /// Example: + /// + /// ```rust + /// # use url::Url; + /// let mut url = Url::parse("https://example.net?...#nav").unwrap(); + /// assert_eq!(url.query(), Some("...")); + /// url.mutate_query_pairs(|query| { + /// query.clear(); + /// query.append_pair("foo", "bar & baz"); + /// }); + /// assert_eq!(url.query(), Some("foo=bar+%26+baz")); + /// assert_eq!(url.as_str(), "https://example.net/?foo=bar+%26+baz#nav"); + /// url.mutate_query_pairs(|query| { + /// query.append_pair("saison", "Été+hiver"); + /// }); + /// assert_eq!(url.query(), Some("foo=bar+%26+baz&saison=%C3%89t%C3%A9%2Bhiver")); + /// ``` + pub fn mutate_query_pairs(&mut self, f: F) { + self.set_query_internal(|url| { + let query_start; + if let Some(start) = url.query_start { + debug_assert!(url.byte_at(start) == b'?'); + query_start = start as usize; + } else { + query_start = url.serialization.len(); + url.query_start = Some(to_u32(query_start).unwrap()); + url.serialization.push('?'); + } + let query_start = query_start + "?".len(); + f(&mut form_urlencoded::Serializer::new(&mut url.serialization, query_start)) + }) + } + + fn set_query_internal(&mut self, f: F) { // Stash any fragment let fragment = self.fragment_start.map(|start| { let f = self.slice(start..).to_owned(); self.serialization.truncate(start as usize); f }); - // Remove 
any previous query - if let Some(start) = self.query_start { - debug_assert!(self.byte_at(start) == b'?'); - self.serialization.truncate(start as usize); - } - // Write the new one - if let Some(input) = query { - self.query_start = Some(to_u32(self.serialization.len()).unwrap()); - self.serialization.push('?'); - let scheme_end = self.scheme_end; - self.mutate(|parser| parser.parse_query(scheme_end, input)); - } + f(self); // Restore the fragment, if any if let Some(ref fragment) = fragment { self.fragment_start = Some(to_u32(self.serialization.len()).unwrap()); @@ -987,13 +1036,6 @@ impl Url { Err(()) } - /// Parse the URL’s query string, if any, as `application/x-www-form-urlencoded` - /// and return an iterator of (key, value) pairs. - #[inline] - pub fn query_pairs(&self) -> form_urlencoded::Parse { - form_urlencoded::parse(self.query().unwrap_or("").as_bytes()) - } - // Private helper methods: #[inline] From 125cf96db14843ed47402ceab2f8320991d07904 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Sat, 2 Apr 2016 19:42:11 +0200 Subject: [PATCH 60/89] Method chaining --- src/form_urlencoded.rs | 21 +++++++++++++-------- src/lib.rs | 2 +- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index aaf94a9d..ec334940 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -208,14 +208,16 @@ impl<'a> Serializer<'a> { } /// Remove any existing name/value pair. - pub fn clear(&mut self) { - self.string.truncate(self.start_position) + pub fn clear(&mut self) -> &mut Self { + self.string.truncate(self.start_position); + self } /// Set the character encoding to be used for names and values before percent-encoding. 
#[cfg(feature = "query_encoding")] - pub fn encoding_override(&mut self, new: Option<::encoding::EncodingRef>) { - self.encoding = EncodingOverride::from_opt_encoding(new).to_output_encoding();; + pub fn encoding_override(&mut self, new: Option<::encoding::EncodingRef>) -> &mut Self { + self.encoding = EncodingOverride::from_opt_encoding(new).to_output_encoding(); + self } fn append_separator_if_needed(&mut self) { @@ -225,11 +227,12 @@ impl<'a> Serializer<'a> { } /// Serialize and append a name/value pair. - pub fn append_pair(&mut self, name: &str, value: &str) { + pub fn append_pair(&mut self, name: &str, value: &str) -> &mut Self { self.append_separator_if_needed(); self.string.extend(byte_serialize(&self.encoding.encode(name.into()))); self.string.push('='); self.string.extend(byte_serialize(&self.encoding.encode(value.into()))); + self } /// Serialize and append a number of name/value pairs. @@ -237,21 +240,23 @@ impl<'a> Serializer<'a> { /// This simply calls `append_pair` repeatedly. /// This can be more convenient, so the user doesn’t need to introduce a block /// to limit the scope of `Serializer`’s borrow of its string. - pub fn append_pairs(&mut self, iter: I) + pub fn append_pairs(&mut self, iter: I) -> &mut Self where I: IntoIterator, I::Item: Borrow<(K, V)>, K: AsRef, V: AsRef { for pair in iter { let &(ref k, ref v) = pair.borrow(); - self.append_pair(k.as_ref(), v.as_ref()) + self.append_pair(k.as_ref(), v.as_ref()); } + self } /// Add a name/value pair whose name is `_charset_` /// and whose value is the character encoding’s name. /// (See the `encoding_override()` method.) 
#[cfg(feature = "query_encoding")] - pub fn append_charset(&mut self) { + pub fn append_charset(&mut self) -> &mut Self { self.append_separator_if_needed(); self.string.push_str("_charset_="); self.string.push_str(self.encoding.name()); + self } } diff --git a/src/lib.rs b/src/lib.rs index 1fa357a7..0d2e1550 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -123,7 +123,7 @@ assert_eq!(css_url.as_str(), "http://servo.github.io/rust-url/main.css") #[cfg(feature="serde")] extern crate serde; #[cfg(feature="heap_size")] #[macro_use] extern crate heapsize; -extern crate idna; +pub extern crate idna; use encoding::EncodingOverride; use host::HostInternal; From 2375850090fe7fee77a3b68e604a433e4cb2c9e4 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Sat, 2 Apr 2016 20:06:57 +0200 Subject: [PATCH 61/89] Backport to Rust 1.7 --- src/encoding.rs | 6 +++--- src/form_urlencoded.rs | 6 +++--- src/percent_encoding.rs | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/encoding.rs b/src/encoding.rs index 42f1a249..4564e340 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -81,7 +81,7 @@ impl EncodingOverride { pub fn encode<'a>(&self, input: Cow<'a, str>) -> Cow<'a, [u8]> { match self.encoding { - Some(encoding) => encoding.encode(&input, EncoderTrap::NcrEscape).unwrap().into(), + Some(encoding) => Cow::Owned(encoding.encode(&input, EncoderTrap::NcrEscape).unwrap()), None => encode_utf8(input) } } @@ -122,7 +122,7 @@ pub fn decode_utf8_lossy(input: Cow<[u8]>) -> Cow { pub fn encode_utf8(input: Cow) -> Cow<[u8]> { match input { - Cow::Borrowed(s) => s.as_bytes().into(), - Cow::Owned(s) => s.into_bytes().into() + Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()), + Cow::Owned(s) => Cow::Owned(s.into_bytes()) } } diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index ec334940..9d7ca436 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -112,7 +112,7 @@ impl<'a> Iterator for Parse<'a> { fn decode(input: &[u8], encoding: 
EncodingOverride) -> Cow { let replaced = replace_plus(input); encoding.decode(match percent_decode(&replaced).if_any() { - Some(vec) => vec.into(), + Some(vec) => Cow::Owned(vec), None => replaced, }) } @@ -120,7 +120,7 @@ fn decode(input: &[u8], encoding: EncodingOverride) -> Cow { /// Replace b'+' with b' ' fn replace_plus<'a>(input: &'a [u8]) -> Cow<'a, [u8]> { match input.iter().position(|&b| b == b'+') { - None => input.into(), + None => Cow::Borrowed(input), Some(first_position) => { let mut replaced = input.to_owned(); replaced[first_position] = b' '; @@ -129,7 +129,7 @@ fn replace_plus<'a>(input: &'a [u8]) -> Cow<'a, [u8]> { *byte = b' '; } } - replaced.into() + Cow::Owned(replaced) } } } diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs index 1716a91e..588b4ba2 100644 --- a/src/percent_encoding.rs +++ b/src/percent_encoding.rs @@ -288,8 +288,8 @@ impl<'a> Iterator for PercentDecode<'a> { impl<'a> From> for Cow<'a, [u8]> { fn from(iter: PercentDecode<'a>) -> Self { match iter.if_any() { - Some(vec) => vec.into(), - None => iter.bytes.as_slice().into(), + Some(vec) => Cow::Owned(vec), + None => Cow::Borrowed(iter.bytes.as_slice()), } } } From 6c6386d488c634cf846a2ff2fe75c44fc35d8125 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Sun, 3 Apr 2016 12:40:09 +0200 Subject: [PATCH 62/89] Url::mutate_query_pairs return a value with Drop rather than take a closure. --- src/form_urlencoded.rs | 133 +++++++++++++++++++++++++++++++-------- src/lib.rs | 139 ++++++++++++++++++++++++----------------- tests/tests.rs | 14 ++--- 3 files changed, 196 insertions(+), 90 deletions(-) diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index 9d7ca436..af3ce787 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -186,30 +186,74 @@ impl<'a> Iterator for ByteSerialize<'a> { /// The [`application/x-www-form-urlencoded` serializer]( /// https://url.spec.whatwg.org/#concept-urlencoded-serializer). 
-pub struct Serializer<'a> { - string: &'a mut String, +pub struct Serializer { + target: Option, start_position: usize, encoding: EncodingOverride, } -impl<'a> Serializer<'a> { +pub trait Target { + fn as_mut_string(&mut self) -> &mut String; + fn finish(self) -> Self::Finished; + type Finished; +} + +impl Target for String { + fn as_mut_string(&mut self) -> &mut String { self } + fn finish(self) -> Self { self } + type Finished = Self; +} + +impl<'a> Target for &'a mut String { + fn as_mut_string(&mut self) -> &mut String { &mut **self } + fn finish(self) -> Self { self } + type Finished = Self; +} + +// `as_mut_string` string here exposes the internal serialization of an `Url`, +// which should not be exposed to users. +// We achieve that by not giving users direct access to `UrlQuery`: +// * Its fields are private +// (and so can not be constructed with struct literal syntax outside of this crate), +// * It has no constructor +// * It is only visible (on the type level) to users in the return type of +// `Url::mutate_query_pairs` which is `Serializer` +// * `Serializer` keeps its target in a private field +// * Unlike in other `Target` impls, `UrlQuery::finished` does not return `Self`. +impl<'a> Target for ::UrlQuery<'a> { + fn as_mut_string(&mut self) -> &mut String { &mut self.url.serialization } + fn finish(self) -> &'a mut ::Url { self.url } + type Finished = &'a mut ::Url; +} + +impl Serializer { + /// Create a new `application/x-www-form-urlencoded` serializer for the given target. + /// + /// If the target is non-empty, + /// its content is assumed to already be in `application/x-www-form-urlencoded` syntax. + pub fn new(target: T) -> Self { + Self::for_suffix(target, 0) + } + /// Create a new `application/x-www-form-urlencoded` serializer - /// for the given range of the given string. + /// for a suffix of the given target. 
/// - /// If the range is non-empty, the corresponding slice of the string is assumed - /// to already be in `application/x-www-form-urlencoded` syntax. - pub fn new(string: &'a mut String, start_position: usize) -> Self { - &string[start_position..]; // Panic if out of bounds + /// If that suffix is non-empty, + /// its content is assumed to already be in `application/x-www-form-urlencoded` syntax. + pub fn for_suffix(mut target: T, start_position: usize) -> Self { + &target.as_mut_string()[start_position..]; // Panic if out of bounds Serializer { - string: string, + target: Some(target), start_position: start_position, encoding: EncodingOverride::utf8(), } } /// Remove any existing name/value pair. + /// + /// Panics if called after `.finish()`. pub fn clear(&mut self) -> &mut Self { - self.string.truncate(self.start_position); + string(&mut self.target).truncate(self.start_position); self } @@ -220,18 +264,11 @@ impl<'a> Serializer<'a> { self } - fn append_separator_if_needed(&mut self) { - if self.string.len() > self.start_position { - self.string.push('&') - } - } - /// Serialize and append a name/value pair. + /// + /// Panics if called after `.finish()`. pub fn append_pair(&mut self, name: &str, value: &str) -> &mut Self { - self.append_separator_if_needed(); - self.string.extend(byte_serialize(&self.encoding.encode(name.into()))); - self.string.push('='); - self.string.extend(byte_serialize(&self.encoding.encode(value.into()))); + append_pair(string(&mut self.target), self.start_position, self.encoding, name, value); self } @@ -240,11 +277,16 @@ impl<'a> Serializer<'a> { /// This simply calls `append_pair` repeatedly. /// This can be more convenient, so the user doesn’t need to introduce a block /// to limit the scope of `Serializer`’s borrow of its string. + /// + /// Panics if called after `.finish()`. 
pub fn append_pairs(&mut self, iter: I) -> &mut Self where I: IntoIterator, I::Item: Borrow<(K, V)>, K: AsRef, V: AsRef { - for pair in iter { - let &(ref k, ref v) = pair.borrow(); - self.append_pair(k.as_ref(), v.as_ref()); + { + let string = string(&mut self.target); + for pair in iter { + let &(ref k, ref v) = pair.borrow(); + append_pair(string, self.start_position, self.encoding, k.as_ref(), v.as_ref()); + } } self } @@ -252,11 +294,50 @@ impl<'a> Serializer<'a> { /// Add a name/value pair whose name is `_charset_` /// and whose value is the character encoding’s name. /// (See the `encoding_override()` method.) + /// + /// Panics if called after `.finish()`. #[cfg(feature = "query_encoding")] pub fn append_charset(&mut self) -> &mut Self { - self.append_separator_if_needed(); - self.string.push_str("_charset_="); - self.string.push_str(self.encoding.name()); + { + let string = string(&mut self.target); + append_separator_if_needed(string, self.start_position); + string.push_str("_charset_="); + string.push_str(self.encoding.name()); + } self } + + /// If this serializer was constructed with a string, take and return that string. + /// + /// ```rust + /// use url::form_urlencoded; + /// let encoded: String = form_urlencoded::Serializer::new(String::new()) + /// .append_pair("foo", "bar & baz") + /// .append_pair("saison", "Été+hiver") + /// .finish(); + /// assert_eq!(encoded, "foo=bar+%26+baz&saison=%C3%89t%C3%A9%2Bhiver"); + /// ``` + /// + /// Panics if called more than once. 
+ pub fn finish(&mut self) -> T::Finished { + self.target.take().expect("url::form_urlencoded::Serializer double finish").finish() + } +} + +fn append_separator_if_needed(string: &mut String, start_position: usize) { + if string.len() > start_position { + string.push('&') + } +} + +fn string(target: &mut Option) -> &mut String { + target.as_mut().expect("url::form_urlencoded::Serializer finished").as_mut_string() +} + +fn append_pair(string: &mut String, start_position: usize, encoding: EncodingOverride, + name: &str, value: &str) { + append_separator_if_needed(string, start_position); + string.extend(byte_serialize(&encoding.encode(name.into()))); + string.push('='); + string.extend(byte_serialize(&encoding.encode(value.into()))); } diff --git a/src/lib.rs b/src/lib.rs index 0d2e1550..df4822b4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -492,76 +492,103 @@ impl Url { } } + fn take_fragment(&mut self) -> Option { + self.fragment_start.take().map(|start| { + debug_assert!(self.byte_at(start) == b'#'); + let fragment = self.slice(start + 1..).to_owned(); + self.serialization.truncate(start as usize); + fragment + }) + } + + fn restore_already_parsed_fragment(&mut self, fragment: Option) { + if let Some(ref fragment) = fragment { + assert!(self.fragment_start.is_none()); + self.fragment_start = Some(to_u32(self.serialization.len()).unwrap()); + self.serialization.push('#'); + self.serialization.push_str(fragment); + } + } + /// Change this URL’s query string. 
pub fn set_query(&mut self, query: Option<&str>) { - self.set_query_internal(|url| { - // Remove any previous query - if let Some(start) = url.query_start.take() { - debug_assert!(url.byte_at(start) == b'?'); - url.serialization.truncate(start as usize); - } - // Write the new query, if any - if let Some(input) = query { - url.query_start = Some(to_u32(url.serialization.len()).unwrap()); - url.serialization.push('?'); - let scheme_end = url.scheme_end; - url.mutate(|parser| parser.parse_query(scheme_end, input)); - } - }) + let fragment = self.take_fragment(); + + // Remove any previous query + if let Some(start) = self.query_start.take() { + debug_assert!(self.byte_at(start) == b'?'); + self.serialization.truncate(start as usize); + } + // Write the new query, if any + if let Some(input) = query { + self.query_start = Some(to_u32(self.serialization.len()).unwrap()); + self.serialization.push('?'); + let scheme_end = self.scheme_end; + self.mutate(|parser| parser.parse_query(scheme_end, input)); + } + + self.restore_already_parsed_fragment(fragment); } - /// Change this URL’s query string, viewed as a sequence of name/value pairs + /// Manipulate this URL’s query string, viewed as a sequence of name/value pairs /// in `application/x-www-form-urlencoded` syntax. 
/// - /// Example: + /// The return value has a method-chaining API: /// /// ```rust /// # use url::Url; - /// let mut url = Url::parse("https://example.net?...#nav").unwrap(); - /// assert_eq!(url.query(), Some("...")); - /// url.mutate_query_pairs(|query| { - /// query.clear(); - /// query.append_pair("foo", "bar & baz"); - /// }); - /// assert_eq!(url.query(), Some("foo=bar+%26+baz")); - /// assert_eq!(url.as_str(), "https://example.net/?foo=bar+%26+baz#nav"); - /// url.mutate_query_pairs(|query| { - /// query.append_pair("saison", "Été+hiver"); - /// }); - /// assert_eq!(url.query(), Some("foo=bar+%26+baz&saison=%C3%89t%C3%A9%2Bhiver")); + /// let mut url = Url::parse("https://example.net?lang=fr#nav").unwrap(); + /// assert_eq!(url.query(), Some("lang=fr")); + /// + /// url.mutate_query_pairs().append_pair("foo", "bar"); + /// assert_eq!(url.query(), Some("lang=fr&foo=bar")); + /// assert_eq!(url.as_str(), "https://example.net/?lang=fr&foo=bar#nav"); + /// + /// url.mutate_query_pairs() + /// .clear() + /// .append_pair("foo", "bar & baz") + /// .append_pair("saisons", "Été+hiver"); + /// assert_eq!(url.query(), Some("foo=bar+%26+baz&saisons=%C3%89t%C3%A9%2Bhiver")); + /// assert_eq!(url.as_str(), + /// "https://example.net/?foo=bar+%26+baz&saisons=%C3%89t%C3%A9%2Bhiver#nav"); /// ``` - pub fn mutate_query_pairs(&mut self, f: F) { - self.set_query_internal(|url| { - let query_start; - if let Some(start) = url.query_start { - debug_assert!(url.byte_at(start) == b'?'); - query_start = start as usize; - } else { - query_start = url.serialization.len(); - url.query_start = Some(to_u32(query_start).unwrap()); - url.serialization.push('?'); - } - let query_start = query_start + "?".len(); - f(&mut form_urlencoded::Serializer::new(&mut url.serialization, query_start)) - }) + /// + /// Note: `url.mutate_query_pairs().clear();` is equivalent to `url.set_query(Some(""))`, + /// not `url.set_query(None)`. 
+ /// + /// The state of `Url` is unspecified if this return value is leaked without being dropped. + pub fn mutate_query_pairs(&mut self) -> form_urlencoded::Serializer { + let fragment = self.take_fragment(); + + let query_start; + if let Some(start) = self.query_start { + debug_assert!(self.byte_at(start) == b'?'); + query_start = start as usize; + } else { + query_start = self.serialization.len(); + self.query_start = Some(to_u32(query_start).unwrap()); + self.serialization.push('?'); + } + + let query = UrlQuery { url: self, fragment: fragment }; + form_urlencoded::Serializer::for_suffix(query, query_start + "?".len()) } +} - fn set_query_internal(&mut self, f: F) { - // Stash any fragment - let fragment = self.fragment_start.map(|start| { - let f = self.slice(start..).to_owned(); - self.serialization.truncate(start as usize); - f - }); - f(self); - // Restore the fragment, if any - if let Some(ref fragment) = fragment { - self.fragment_start = Some(to_u32(self.serialization.len()).unwrap()); - debug_assert!(fragment.starts_with('#')); - self.serialization.push_str(fragment) // It’s already been through the parser - } + +/// Implementation detail of `Url::mutate_query_pairs`. Typically not used directly. +pub struct UrlQuery<'a> { + url: &'a mut Url, + fragment: Option, +} + +impl<'a> Drop for UrlQuery<'a> { + fn drop(&mut self) { + self.url.restore_already_parsed_fragment(self.fragment.take()) } +} +impl Url { /// Change this URL’s path. 
pub fn set_path(&mut self, path: &str) { let (old_after_path_pos, after_path) = match (self.query_start, self.fragment_start) { diff --git a/tests/tests.rs b/tests/tests.rs index 1024f4b9..0ce0d1b8 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -217,19 +217,17 @@ fn test_form_urlencoded() { ("bar".into(), "".into()), ("foo".into(), "#".into()) ]; - let mut encoded = String::new(); - form_urlencoded::Serializer::new(&mut encoded, 0).append_pairs(pairs); + let encoded = form_urlencoded::Serializer::new(String::new()).append_pairs(pairs).finish(); assert_eq!(encoded, "foo=%C3%A9%26&bar=&foo=%23"); assert_eq!(form_urlencoded::parse(encoded.as_bytes()).collect::>(), pairs.to_vec()); } #[test] fn test_form_serialize() { - let mut encoded = String::new(); - form_urlencoded::Serializer::new(&mut encoded, 0).append_pairs(&[ - ("foo", "é&"), - ("bar", ""), - ("foo", "#") - ]); + let encoded = form_urlencoded::Serializer::new(String::new()) + .append_pair("foo", "é&") + .append_pair("bar", "") + .append_pair("foo", "#") + .finish(); assert_eq!(encoded, "foo=%C3%A9%26&bar=&foo=%23"); } From dd2d1ea160db3edbfc9a10c56e402d3554608c97 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Sun, 3 Apr 2016 13:06:24 +0200 Subject: [PATCH 63/89] Rename append_pairs to append_pair_iter --- src/form_urlencoded.rs | 2 +- tests/tests.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index af3ce787..9dfd070c 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -279,7 +279,7 @@ impl Serializer { /// to limit the scope of `Serializer`’s borrow of its string. /// /// Panics if called after `.finish()`. 
- pub fn append_pairs(&mut self, iter: I) -> &mut Self + pub fn append_pair_iter(&mut self, iter: I) -> &mut Self where I: IntoIterator, I::Item: Borrow<(K, V)>, K: AsRef, V: AsRef { { let string = string(&mut self.target); diff --git a/tests/tests.rs b/tests/tests.rs index 0ce0d1b8..bfcb9590 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -217,7 +217,7 @@ fn test_form_urlencoded() { ("bar".into(), "".into()), ("foo".into(), "#".into()) ]; - let encoded = form_urlencoded::Serializer::new(String::new()).append_pairs(pairs).finish(); + let encoded = form_urlencoded::Serializer::new(String::new()).append_pair_iter(pairs).finish(); assert_eq!(encoded, "foo=%C3%A9%26&bar=&foo=%23"); assert_eq!(form_urlencoded::parse(encoded.as_bytes()).collect::>(), pairs.to_vec()); } From 224aee62d71c78d215eaa9678b30f2d4d6bab096 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 8 Apr 2016 19:24:28 +0200 Subject: [PATCH 64/89] Add form_urlencoded::Parse::into_owned --- src/form_urlencoded.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index 9dfd070c..c9f4156a 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -16,6 +16,7 @@ use encoding::EncodingOverride; use percent_encoding::{percent_encode_byte, percent_decode}; use std::borrow::{Borrow, Cow}; +use std::iter; use std::str; @@ -79,6 +80,7 @@ pub fn parse_with_encoding<'a>(input: &'a [u8], } /// The return type of `parse()`. +#[derive(Copy, Clone)] pub struct Parse<'a> { input: &'a [u8], encoding: EncodingOverride, @@ -134,6 +136,16 @@ fn replace_plus<'a>(input: &'a [u8]) -> Cow<'a, [u8]> { } } +impl<'a> Parse<'a> { + /// Return a new iterator that yields pairs of `String` instead of pair of `Cow`. 
+ pub fn into_owned(self) -> iter::Map, fn((Cow, Cow)) -> (String, String)> { + fn into_owned((k, v): (Cow, Cow)) -> (String, String) { + (k.into_owned(), v.into_owned()) + } + self.map(into_owned) + } +} + /// The [`application/x-www-form-urlencoded` byte serializer]( /// https://url.spec.whatwg.org/#concept-urlencoded-byte-serializer). /// From f4bd6e5b65d9274ec55ddb6538d8315c3d41929b Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 4 Apr 2016 18:29:46 +0200 Subject: [PATCH 65/89] Check invariants during tests. --- src/host.rs | 2 +- src/lib.rs | 90 ++++++++++++++++++++++++++++++++++++++++-- src/slicing.rs | 10 ++--- tests/urltestdata.json | 16 ++++++++ tests/wpt.rs | 2 + 5 files changed, 110 insertions(+), 10 deletions(-) diff --git a/src/host.rs b/src/host.rs index f1889476..99f8cae4 100644 --- a/src/host.rs +++ b/src/host.rs @@ -15,7 +15,7 @@ use parser::{ParseResult, ParseError}; use percent_encoding::percent_decode; use idna; -#[derive(Copy, Clone, Debug)] +#[derive(Copy, Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature="heap_size", derive(HeapSizeOf))] pub enum HostInternal { None, diff --git a/src/lib.rs b/src/lib.rs index df4822b4..2f1bf5ca 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -260,6 +260,92 @@ impl Url { self.serialization } + /// For internal testing. + /// + /// Methods of the `Url` struct assume a number of invariants. + /// This checks each of these invariants and panic if one is not met. + /// This is for testing rust-url itself. + pub fn assert_invariants(&self) { + macro_rules! assert { + ($x: expr) => { + if !$x { + panic!("!( {} ) for URL {:?}", stringify!($x), self.serialization) + } + } + } + + macro_rules! 
assert_eq { + ($a: expr, $b: expr) => { + { + let a = $a; + let b = $b; + if a != b { + panic!("{:?} != {:?} ({} != {}) for URL {:?}", + a, b, stringify!($a), stringify!($b), self.serialization) + } + } + } + } + + assert!(self.scheme_end >= 1); + assert!(matches!(self.byte_at(0), b'a'...b'z' | b'A'...b'Z')); + assert!(self.slice(1..self.scheme_end).chars() + .all(|c| matches!(c, 'a'...'z' | 'A'...'Z' | '0'...'9' | '+' | '-' | '.'))); + assert_eq!(self.byte_at(self.scheme_end), b':'); + + if self.slice(self.scheme_end + 1 ..).starts_with("//") { + // URL with authority + match self.byte_at(self.username_end) { + b':' => { + assert!(self.host_start >= self.username_end + 2); + assert_eq!(self.byte_at(self.host_start - 1), b'@'); + } + b'@' => assert!(self.host_start == self.username_end + 1), + _ => assert_eq!(self.username_end, self.scheme_end + 3), + } + assert!(self.host_start >= self.username_end); + assert!(self.host_end >= self.host_start); + let host_str = self.slice(self.host_start..self.host_end); + match self.host { + HostInternal::None => assert_eq!(host_str, ""), + HostInternal::Ipv4(address) => assert_eq!(host_str, address.to_string()), + HostInternal::Ipv6(address) => assert_eq!(host_str, format!("[{}]", address)), + HostInternal::Domain => { + if SchemeType::from(self.scheme()).is_special() { + assert!(!host_str.is_empty()) + } + } + } + if self.path_start == self.host_end { + assert_eq!(self.port, None); + } else { + assert_eq!(self.byte_at(self.host_end), b':'); + let port_str = self.slice(self.host_end + 1..self.path_start); + assert_eq!(self.port, Some(port_str.parse::().unwrap())); + } + assert_eq!(self.byte_at(self.path_start), b'/'); + } else { + // Anarchist URL (no authority) + assert_eq!(self.username_end, self.scheme_end + 1); + assert_eq!(self.host_start, self.scheme_end + 1); + assert_eq!(self.host_end, self.scheme_end + 1); + assert_eq!(self.host, HostInternal::None); + assert_eq!(self.port, None); + assert_eq!(self.path_start, 
self.scheme_end + 1); + } + if let Some(start) = self.query_start { + assert!(start > self.path_start); + assert_eq!(self.byte_at(start), b'?'); + } + if let Some(start) = self.fragment_start { + assert!(start > self.path_start); + assert_eq!(self.byte_at(start), b'#'); + } + if let (Some(query_start), Some(fragment_start)) = (self.query_start, self.fragment_start) { + assert!(fragment_start > query_start); + } + } + /// Return the scheme of this URL, lower-cased, as an ASCII string without the ':' delimiter. #[inline] pub fn scheme(&self) -> &str { @@ -297,9 +383,7 @@ impl Url { pub fn password(&self) -> Option<&str> { // This ':' is not the one marking a port number since a host can not be empty. // (Except for file: URLs, which do not have port numbers.) - if self.byte_at(self.username_end) == b':' { - debug_assert!(self.has_host()); - debug_assert!(self.host_start < self.host_end); + if self.has_host() && self.byte_at(self.username_end) == b':' { debug_assert!(self.byte_at(self.host_start - 1) == b'@'); Some(self.slice(self.username_end + 1..self.host_start - 1)) } else { diff --git a/src/slicing.rs b/src/slicing.rs index 94d6c38d..2c8d75e6 100644 --- a/src/slicing.rs +++ b/src/slicing.rs @@ -115,18 +115,16 @@ impl Url { Position::AfterUsername => self.username_end as usize, - Position::BeforePassword => if self.byte_at(self.username_end) == b':' { - debug_assert!(self.has_host()); - debug_assert!(self.host_start < self.host_end); + Position::BeforePassword => if self.has_host() && + self.byte_at(self.username_end) == b':' { self.username_end as usize + ":".len() } else { debug_assert!(self.username_end == self.host_start); self.username_end as usize }, - Position::AfterPassword => if self.byte_at(self.username_end) == b':' { - debug_assert!(self.has_host()); - debug_assert!(self.host_start < self.host_end); + Position::AfterPassword => if self.has_host() && + self.byte_at(self.username_end) == b':' { debug_assert!(self.byte_at(self.host_start - "@".len() as 
u32) == b'@'); self.host_start as usize - "@".len() } else { diff --git a/tests/urltestdata.json b/tests/urltestdata.json index 2c7d344f..bb95804f 100644 --- a/tests/urltestdata.json +++ b/tests/urltestdata.json @@ -4163,6 +4163,22 @@ "search": "", "hash": "" }, + "# unknown scheme with path looking like a password", + { + "input": "sc::a@example.net", + "base": "about:blank", + "href": "sc::a@example.net", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": ":a@example.net", + "search": "", + "hash": "" + }, "# tests from jsdom/whatwg-url designed for code coverage", { "input": "http://127.0.0.1:10100/relative_import.html", diff --git a/tests/wpt.rs b/tests/wpt.rs index a98fc607..3090beb9 100644 --- a/tests/wpt.rs +++ b/tests/wpt.rs @@ -28,6 +28,8 @@ fn run_one(input: String, base: String, expected: Result) { (Ok(_), Err(())) => panic!("Expected a parse error for URL {:?}", input), }; + url.assert_invariants(); + macro_rules! assert_eq { ($expected: expr, $got: expr) => { { From 0cd4d368055ab0c0f0cafe42998cb069d7f1c4d9 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 8 Apr 2016 15:29:26 +0200 Subject: [PATCH 66/89] Rename test crates. --- Cargo.toml | 4 ++-- tests/{wpt.rs => data.rs} | 3 ++- tests/{tests.rs => unit.rs} | 2 ++ 3 files changed, 6 insertions(+), 3 deletions(-) rename tests/{wpt.rs => data.rs} (97%) rename tests/{tests.rs => unit.rs} (99%) diff --git a/Cargo.toml b/Cargo.toml index d396b717..6af1dbe3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,10 +12,10 @@ keywords = ["url", "parser"] license = "MIT/Apache-2.0" [[test]] -name = "tests" +name = "unit" [[test]] -name = "wpt" +name = "data" harness = false [lib] diff --git a/tests/wpt.rs b/tests/data.rs similarity index 97% rename from tests/wpt.rs rename to tests/data.rs index 3090beb9..0fa3d76e 100644 --- a/tests/wpt.rs +++ b/tests/data.rs @@ -6,7 +6,7 @@ // option. 
This file may not be copied, modified, or distributed // except according to those terms. -//! Tests copied form https://github.com/w3c/web-platform-tests/blob/master/url/ +//! Data-driven tests extern crate rustc_serialize; extern crate test; @@ -79,6 +79,7 @@ struct TestCase { } fn main() { + // Copied form https://github.com/w3c/web-platform-tests/blob/master/url/ let json = Json::from_str(include_str!("urltestdata.json")) .expect("JSON parse error in urltestdata.json"); let tests = json.as_array().unwrap().iter().filter_map(|entry| { diff --git a/tests/tests.rs b/tests/unit.rs similarity index 99% rename from tests/tests.rs rename to tests/unit.rs index bfcb9590..2c458cae 100644 --- a/tests/tests.rs +++ b/tests/unit.rs @@ -6,6 +6,8 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. +//! Unit tests + extern crate url; use std::borrow::Cow; From e8df8a303f0e1af61cdff843c34ef9412cc5c732 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 8 Apr 2016 15:39:55 +0200 Subject: [PATCH 67/89] Prepare for more than one kind of data-driven test. 
--- tests/data.rs | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/tests/data.rs b/tests/data.rs index 0fa3d76e..8c4ebe76 100644 --- a/tests/data.rs +++ b/tests/data.rs @@ -16,7 +16,7 @@ use rustc_serialize::json::Json; use url::{Url, Position}; -fn run_one(input: String, base: String, expected: Result) { +fn run_parsing(input: String, base: String, expected: Result) { let base = match Url::parse(&base) { Ok(base) => base, Err(message) => panic!("Error parsing base {:?}: {}", base, message) @@ -64,7 +64,7 @@ fn trim(s: &str) -> &str { } } -struct TestCase { +struct ParsingTestCase { href: String, origin: Option, protocol: String, @@ -78,13 +78,13 @@ struct TestCase { hash: String, } -fn main() { +fn collect_parsing(add_test: &mut F) { // Copied form https://github.com/w3c/web-platform-tests/blob/master/url/ let json = Json::from_str(include_str!("urltestdata.json")) .expect("JSON parse error in urltestdata.json"); - let tests = json.as_array().unwrap().iter().filter_map(|entry| { + for entry in json.as_array().unwrap() { if entry.is_string() { - return None // ignore comments + continue // ignore comments } let string = |key| entry.find(key).unwrap().as_string().unwrap().to_owned(); let base = string("base"); @@ -92,7 +92,7 @@ fn main() { let expected = if entry.find("failure").is_some() { Err(()) } else { - Ok(TestCase { + Ok(ParsingTestCase { href: string("href"), origin: entry.find("origin").map(|j| j.as_string().unwrap().to_owned()), protocol: string("protocol"), @@ -106,14 +106,25 @@ fn main() { hash: string("hash"), }) }; - Some(test::TestDescAndFn { - desc: test::TestDesc { - name: test::DynTestName(format!("{:?} @ base {:?}", input, base)), - ignore: false, - should_panic: test::ShouldPanic::No, - }, - testfn: test::TestFn::dyn_test_fn(move || run_one(input, base, expected)), - }) - }).collect(); + add_test(format!("{:?} @ base {:?}", input, base), + test::TestFn::dyn_test_fn(move || 
run_parsing(input, base, expected))); + } +} + +fn main() { + let mut tests = Vec::new(); + { + let mut add_one = |name: String, run: test::TestFn| { + tests.push(test::TestDescAndFn { + desc: test::TestDesc { + name: test::DynTestName(name), + ignore: false, + should_panic: test::ShouldPanic::No, + }, + testfn: run, + }) + }; + collect_parsing(&mut add_one); + } test::test_main(&std::env::args().collect::>(), tests) } From 868511226901c2ee6716972555476be037da518f Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 8 Apr 2016 19:03:38 +0200 Subject: [PATCH 68/89] Bring back urlutils.rs / webidl.rs after all Rename it quirks.rs this time. It turns out tests need it. --- src/lib.rs | 69 +--------------- src/quirks.rs | 216 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 218 insertions(+), 67 deletions(-) create mode 100644 src/quirks.rs diff --git a/src/lib.rs b/src/lib.rs index 2f1bf5ca..af491b57 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -151,8 +151,9 @@ mod origin; mod parser; mod slicing; -pub mod percent_encoding; pub mod form_urlencoded; +pub mod percent_encoding; +pub mod quirks; /// A parsed URL record. #[derive(Clone)] @@ -1006,72 +1007,6 @@ impl Url { Ok(()) } - /// Setter for https://url.spec.whatwg.org/#dom-url-host - /// - /// Unless you need to be interoperable with web browsers, - /// use `set_host` and `set_port` instead. 
- pub fn quirky_set_host_and_port(&mut self, new_host: &str) -> Result<(), ()> { - if self.cannot_be_a_base() { - return Err(()) - } - let host; - let opt_port; - { - let scheme = self.scheme(); - let result = Parser::parse_host(new_host, SchemeType::from(scheme), |_| ()); - match result { - Ok((h, remaining)) => { - host = h; - opt_port = if remaining.starts_with(':') { - Parser::parse_port(remaining, |_| (), || parser::default_port(scheme)) - .ok().map(|(port, _remaining)| port) - } else { - None - }; - } - Err(_) => return Err(()) - } - } - self.set_host_internal(host, opt_port); - Ok(()) - } - - /// Setter for https://url.spec.whatwg.org/#dom-url-hostname - /// - /// Unless you need to be interoperable with web browsers, use `set_host` instead. - pub fn quirky_set_host(&mut self, new_hostname: &str) -> Result<(), ()> { - if self.cannot_be_a_base() { - return Err(()) - } - let result = Parser::parse_host(new_hostname, SchemeType::from(self.scheme()), |_| ()); - if let Ok((host, _remaining)) = result { - self.set_host_internal(host, None); - Ok(()) - } else { - Err(()) - } - } - - /// Setter for https://url.spec.whatwg.org/#dom-url-port - /// - /// Unless you need to be interoperable with web browsers, use `set_port` instead. - pub fn quirky_set_port(&mut self, new_port: &str) -> Result<(), ()> { - let result; - { - // has_host implies !cannot_be_a_base - let scheme = self.scheme(); - if !self.has_host() || scheme == "file" { - return Err(()) - } - result = Parser::parse_port(new_port, |_| (), || parser::default_port(scheme)) - } - if let Ok((new_port, _remaining)) = result { - self.set_port_internal(new_port); - Ok(()) - } else { - Err(()) - } - } /// Convert a file name as `std::path::Path` into an URL in the `file` scheme. /// /// This returns `Err` if the given path is not absolute or, diff --git a/src/quirks.rs b/src/quirks.rs new file mode 100644 index 00000000..5ef2baa9 --- /dev/null +++ b/src/quirks.rs @@ -0,0 +1,216 @@ +// Copyright 2016 Simon Sapin. 
+// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Getters and setters for URL components implemented per https://url.spec.whatwg.org/#api +//! +//! Unless you need to be interoperable with web browsers, you probably don’t want to use this. + +use {Url, Position, Host, ParseError, idna}; +use parser::{Parser, SchemeType, default_port}; + +/// https://url.spec.whatwg.org/#dom-url-domaintoascii +pub fn domain_to_ascii(domain: &str) -> String { + match Host::parse(domain) { + Ok(Host::Domain(domain)) => domain, + _ => String::new(), + } +} + +/// https://url.spec.whatwg.org/#dom-url-domaintounicode +pub fn domain_to_unicode(domain: &str) -> String { + match Host::parse(domain) { + Ok(Host::Domain(ref domain)) => { + let (unicode, _errors) = idna::domain_to_unicode(domain); + unicode + } + _ => String::new(), + } +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-href +pub fn href(url: &Url) -> &str { + url.as_str() +} + +/// Setter for https://url.spec.whatwg.org/#dom-url-href +pub fn set_href(url: &mut Url, value: &str) -> Result<(), ParseError> { + *url = try!(Url::parse(value)); + Ok(()) +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-origin +pub fn origin(url: &Url) -> String { + url.origin().unicode_serialization() +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-protocol +#[inline] +pub fn protocol(url: &Url) -> &str { + &url.as_str()[..url.scheme().len() + ":".len()] +} + +/// Setter for https://url.spec.whatwg.org/#dom-url-protocol +pub fn set_protocol(url: &mut Url, mut new_protocol: &str) -> Result<(), ()> { + // The scheme state in the spec ignores everything after the first `:`, + // but `set_scheme` errors if there is more. 
+ if let Some(position) = new_protocol.find(':') { + new_protocol = &new_protocol[..position]; + } + url.set_scheme(new_protocol) +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-username +#[inline] +pub fn username(url: &Url) -> &str { + url.username() +} + +/// Setter for https://url.spec.whatwg.org/#dom-url-username +pub fn set_username(url: &mut Url, new_username: &str) -> Result<(), ()> { + url.set_username(new_username) +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-password +#[inline] +pub fn password(url: &Url) -> &str { + url.password().unwrap_or("") +} + +/// Setter for https://url.spec.whatwg.org/#dom-url-password +pub fn set_password(url: &mut Url, new_password: &str) -> Result<(), ()> { + url.set_password(if new_password.is_empty() { None } else { Some(new_password) }) +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-host +#[inline] +pub fn host(url: &Url) -> &str { + &url[Position::BeforeHost..Position::AfterPort] +} + +/// Setter for https://url.spec.whatwg.org/#dom-url-host +pub fn set_host(url: &mut Url, new_host: &str) -> Result<(), ()> { + if url.cannot_be_a_base() { + return Err(()) + } + let host; + let opt_port; + { + let scheme = url.scheme(); + let result = Parser::parse_host(new_host, SchemeType::from(scheme), |_| ()); + match result { + Ok((h, remaining)) => { + host = h; + opt_port = if remaining.starts_with(':') { + Parser::parse_port(remaining, |_| (), || default_port(scheme)) + .ok().map(|(port, _remaining)| port) + } else { + None + }; + } + Err(_) => return Err(()) + } + } + url.set_host_internal(host, opt_port); + Ok(()) +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-hostname +#[inline] +pub fn hostname(url: &Url) -> &str { + url.host_str().unwrap_or("") +} + +/// Setter for https://url.spec.whatwg.org/#dom-url-hostname +pub fn set_hostname(url: &mut Url, new_hostname: &str) -> Result<(), ()> { + if url.cannot_be_a_base() { + return Err(()) + } + let result = Parser::parse_host(new_hostname, 
SchemeType::from(url.scheme()), |_| ()); + if let Ok((host, _remaining)) = result { + url.set_host_internal(host, None); + Ok(()) + } else { + Err(()) + } +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-port +#[inline] +pub fn port(url: &Url) -> &str { + &url[Position::BeforePort..Position::AfterPort] +} + +/// Setter for https://url.spec.whatwg.org/#dom-url-port +pub fn set_port(url: &mut Url, new_port: &str) -> Result<(), ()> { + let result; + { + // has_host implies !cannot_be_a_base + let scheme = url.scheme(); + if !url.has_host() || scheme == "file" { + return Err(()) + } + result = Parser::parse_port(new_port, |_| (), || default_port(scheme)) + } + if let Ok((new_port, _remaining)) = result { + url.set_port_internal(new_port); + Ok(()) + } else { + Err(()) + } +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-pathname +#[inline] +pub fn pathname(url: &Url) -> &str { + url.path() +} + +/// Setter for https://url.spec.whatwg.org/#dom-url-pathname +pub fn set_pathname(url: &mut Url, new_pathname: &str) { + if !url.cannot_be_a_base() { + url.set_path(new_pathname) + } +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-search +pub fn search(url: &Url) -> &str { + trim(&url[Position::AfterPath..Position::AfterQuery]) +} + +/// Setter for https://url.spec.whatwg.org/#dom-url-search +pub fn set_search(url: &mut Url, new_search: &str) { + url.set_query(match new_search { + "" => None, + _ if new_search.starts_with('?') => Some(&new_search[1..]), + _ => Some(new_search), + }) +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-hash +pub fn hash(url: &Url) -> &str { + trim(&url[Position::AfterQuery..]) +} + +/// Setter for https://url.spec.whatwg.org/#dom-url-hash +pub fn set_hash(url: &mut Url, new_hash: &str) { + if url.scheme() != "javascript" { + url.set_fragment(match new_hash { + "" => None, + _ if new_hash.starts_with('#') => Some(&new_hash[1..]), + _ => Some(new_hash), + }) + } +} + +fn trim(s: &str) -> &str { + if s.len() == 1 { 
+ "" + } else { + s + } +} From 92eb5a6b3b56a92d85ef12f084255a2421e5588b Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Fri, 8 Apr 2016 19:13:56 +0200 Subject: [PATCH 69/89] Test harness for setters. --- tests/data.rs | 139 ++++++++++++++++++++++++++++----------- tests/setters_tests.json | 21 ++++++ 2 files changed, 121 insertions(+), 39 deletions(-) create mode 100644 tests/setters_tests.json diff --git a/tests/data.rs b/tests/data.rs index 8c4ebe76..b0d23bb9 100644 --- a/tests/data.rs +++ b/tests/data.rs @@ -12,11 +12,11 @@ extern crate rustc_serialize; extern crate test; extern crate url; -use rustc_serialize::json::Json; -use url::{Url, Position}; +use rustc_serialize::json::{self, Json}; +use url::{Url, quirks}; -fn run_parsing(input: String, base: String, expected: Result) { +fn run_parsing(input: String, base: String, expected: Result) { let base = match Url::parse(&base) { Ok(base) => base, Err(message) => panic!("Error parsing base {:?}: {}", base, message) @@ -41,30 +41,24 @@ fn run_parsing(input: String, base: String, expected: Result { + { + $( + assert_eq!(expected.$attr, quirks::$attr(&url)); + )+; + } + } } - assert_eq!(expected.protocol, &url.as_str()[..url.scheme().len() + ":".len()]); - assert_eq!(expected.username, url.username()); - assert_eq!(expected.password, url.password().unwrap_or("")); - assert_eq!(expected.host, &url[Position::BeforeHost..Position::AfterPort]); - assert_eq!(expected.hostname, url.host_str().unwrap_or("")); - assert_eq!(expected.port, &url[Position::BeforePort..Position::AfterPort]); - assert_eq!(expected.pathname, url.path()); - assert_eq!(expected.search, trim(&url[Position::AfterPath..Position::AfterQuery])); - assert_eq!(expected.hash, trim(&url[Position::AfterQuery..])); -} -fn trim(s: &str) -> &str { - if s.len() == 1 { - "" - } else { - s + assert_attributes!(href protocol username password host hostname port pathname search hash); + + if let Some(expected_origin) = expected.origin { + assert_eq!(expected_origin, 
quirks::origin(&url)); } } -struct ParsingTestCase { +struct ExpectedAttributes { href: String, origin: Option, protocol: String, @@ -78,32 +72,56 @@ struct ParsingTestCase { hash: String, } +trait JsonExt { + fn take(&mut self, key: &str) -> Option; + fn object(self) -> json::Object; + fn string(self) -> String; + fn take_string(&mut self, key: &str) -> String; +} + +impl JsonExt for Json { + fn take(&mut self, key: &str) -> Option { + self.as_object_mut().unwrap().remove(key) + } + + fn object(self) -> json::Object { + if let Json::Object(o) = self { o } else { panic!("Not a Json::Object") } + } + + fn string(self) -> String { + if let Json::String(s) = self { s } else { panic!("Not a Json::String") } + } + + fn take_string(&mut self, key: &str) -> String { + self.take(key).unwrap().string() + } +} + fn collect_parsing(add_test: &mut F) { // Copied form https://github.com/w3c/web-platform-tests/blob/master/url/ - let json = Json::from_str(include_str!("urltestdata.json")) + let mut json = Json::from_str(include_str!("urltestdata.json")) .expect("JSON parse error in urltestdata.json"); - for entry in json.as_array().unwrap() { + for entry in json.as_array_mut().unwrap() { if entry.is_string() { continue // ignore comments } - let string = |key| entry.find(key).unwrap().as_string().unwrap().to_owned(); - let base = string("base"); - let input = string("input"); + let base = entry.take_string("base"); + let input = entry.take_string("input"); let expected = if entry.find("failure").is_some() { Err(()) } else { - Ok(ParsingTestCase { - href: string("href"), - origin: entry.find("origin").map(|j| j.as_string().unwrap().to_owned()), - protocol: string("protocol"), - username: string("username"), - password: string("password"), - host: string("host"), - hostname: string("hostname"), - port: string("port"), - pathname: string("pathname"), - search: string("search"), - hash: string("hash"), + Ok(ExpectedAttributes { + href: entry.take_string("href"), + origin: 
entry.take("origin").map(Json::string), + protocol: entry.take_string("protocol"), + username: entry.take_string("username"), + password: entry.take_string("password"), + host: entry.take_string("host"), + hostname: entry.take_string("hostname"), + port: entry.take_string("port"), + pathname: entry.take_string("pathname"), + search: entry.take_string("search"), + hash: entry.take_string("hash"), }) }; add_test(format!("{:?} @ base {:?}", input, base), @@ -111,6 +129,48 @@ fn collect_parsing(add_test: &mut F) { } } +fn collect_setters(add_test: &mut F) where F: FnMut(String, test::TestFn) { + let mut json = Json::from_str(include_str!("setters_tests.json")) + .expect("JSON parse error in setters_tests.json"); + + macro_rules! setter { + ($attr: expr, $setter: ident) => {{ + let mut tests = json.take($attr).unwrap(); + for mut test in tests.as_array_mut().unwrap().drain(..) { + let comment = test.take("comment").map(Json::string).unwrap_or(String::new()); + let href = test.take_string("href"); + let new_value = test.take_string("new_value"); + let name = format!("{:?}.{} = {:?} {}", href, $attr, new_value, comment); + let mut expected = test.take("expected").unwrap(); + add_test(name, test::TestFn::dyn_test_fn(move || { + let mut url = Url::parse(&href).unwrap(); + let _ = quirks::$setter(&mut url, &new_value); + assert_attributes!(url, expected, + href protocol username password host hostname port pathname search hash); + })) + } + }} + } + macro_rules! 
assert_attributes { + ($url: expr, $expected: expr, $($attr: ident)+) => { + $( + if let Some(value) = $expected.take(stringify!($attr)) { + assert_eq!(quirks::$attr(&$url), value.string()) + } + )+ + } + } + setter!("protocol", set_protocol); + setter!("username", set_username); + setter!("password", set_password); + setter!("hostname", set_hostname); + setter!("host", set_host); + setter!("port", set_port); + setter!("pathname", set_pathname); + setter!("search", set_search); + setter!("hash", set_hash); +} + fn main() { let mut tests = Vec::new(); { @@ -125,6 +185,7 @@ fn main() { }) }; collect_parsing(&mut add_one); + collect_setters(&mut add_one); } test::test_main(&std::env::args().collect::>(), tests) } diff --git a/tests/setters_tests.json b/tests/setters_tests.json new file mode 100644 index 00000000..eb621b51 --- /dev/null +++ b/tests/setters_tests.json @@ -0,0 +1,21 @@ +{ + "protocol": [ + { + "comment": "The empty scheme is not a valid scheme.", + "href": "http://example.net", + "new_value": "", + "expected": { + "href": "http://example.net/", + "protocol": "http:" + } + } + ], + "username": [], + "password": [], + "host": [], + "hostname": [], + "port": [], + "pathname": [], + "search": [], + "hash": [] +} From 81779d5eada41719b2007281df06aca7204651a9 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 12 Apr 2016 15:33:39 +0200 Subject: [PATCH 70/89] Add protocol setter tests. --- src/quirks.rs | 3 +- tests/setters_tests.json | 134 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 133 insertions(+), 4 deletions(-) diff --git a/src/quirks.rs b/src/quirks.rs index 5ef2baa9..cbb728ba 100644 --- a/src/quirks.rs +++ b/src/quirks.rs @@ -8,7 +8,8 @@ //! Getters and setters for URL components implemented per https://url.spec.whatwg.org/#api //! -//! Unless you need to be interoperable with web browsers, you probably don’t want to use this. +//! Unless you need to be interoperable with web browsers, +//! you probably want to use `Url` method instead. 
use {Url, Position, Host, ParseError, idna}; use parser::{Parser, SchemeType, default_port}; diff --git a/tests/setters_tests.json b/tests/setters_tests.json index eb621b51..83577a8c 100644 --- a/tests/setters_tests.json +++ b/tests/setters_tests.json @@ -1,13 +1,141 @@ { + "comment": [ + "## Tests for setters of https://url.spec.whatwg.org/#urlutils-members", + "", + "This file contains a JSON object.", + "Other than 'comment', each key is an attribute of the `URL` interface", + "defined in WHATWG’s URL Standard.", + "The values are arrays of test case objects for that attribute.", + "", + "To run a test case for the attribute `attr`:", + "", + "* Create a new `URL` object with the value for the 'href' key", + " the constructor single parameter. (Without a base URL.)", + " This must not throw.", + "* Set the attribute `attr` to (invoke its setter with)", + " with the value of for 'new_value' key.", + "* The value for the 'expected' key is another object.", + " For each `key` / `value` pair of that object,", + " get the attribute `key` (invoke its getter).", + " The returned string must be equal to `value`.", + "", + "Note: the 'href' setter is already covered by urltestdata.json." + ], "protocol": [ { - "comment": "The empty scheme is not a valid scheme.", - "href": "http://example.net", + "comment": "The empty string is not a valid scheme. 
Setter leaves the URL unchanged.", + "href": "a://example.net", "new_value": "", "expected": { - "href": "http://example.net/", + "href": "a://example.net/", + "protocol": "a:" + } + }, + { + "href": "a://example.net", + "new_value": "b", + "expected": { + "href": "b://example.net/", + "protocol": "b:" + } + }, + { + "comment": "Upper-case ASCII is lower-cased", + "href": "a://example.net", + "new_value": "B", + "expected": { + "href": "b://example.net/", + "protocol": "b:" + } + }, + { + "comment": "Non-ASCII is rejected", + "href": "a://example.net", + "new_value": "é", + "expected": { + "href": "a://example.net/", + "protocol": "a:" + } + }, + { + "comment": "No leading digit", + "href": "a://example.net", + "new_value": "0b", + "expected": { + "href": "a://example.net/", + "protocol": "a:" + } + }, + { + "comment": "No leading punctuation", + "href": "a://example.net", + "new_value": "+b", + "expected": { + "href": "a://example.net/", + "protocol": "a:" + } + }, + { + "href": "a://example.net", + "new_value": "bC0+-.", + "expected": { + "href": "bc0+-.://example.net/", + "protocol": "bc0+-.:" + } + }, + { + "comment": "Non-ASCII is rejected", + "href": "a://example.net", + "new_value": "bé", + "expected": { + "href": "a://example.net/", + "protocol": "a:" + } + }, + { + "comment": "Spec deviation: from special scheme to not is not problematic. https://github.com/whatwg/url/issues/104", + "href": "http://example.net", + "new_value": "b", + "expected": { + "href": "b://example.net/", + "protocol": "b:" + } + }, + { + "comment": "Cannot-be-a-base URL doesn’t have a host, but URL in a special scheme must.", + "href": "mailto:me@example.net", + "new_value": "http", + "expected": { + "href": "mailto:me@example.net", + "protocol": "mailto:" + } + }, + { + "comment": "Spec deviation: from non-special scheme with a host to special is not problematic. 
https://github.com/whatwg/url/issues/104", + "href": "ssh://me@example.net", + "new_value": "http", + "expected": { + "href": "http://me@example.net/", "protocol": "http:" } + }, + { + "comment": "Stuff after the first ':' is ignored", + "href": "http://example.net", + "new_value": "https:foo : bar", + "expected": { + "href": "https://example.net/", + "protocol": "https:" + } + }, + { + "comment": "Stuff after the first ':' is ignored", + "href": "data:text/html,

Test", + "new_value": "view-source+data:foo : bar", + "expected": { + "href": "view-source+data:text/html,

Test", + "protocol": "view-source+data:" + } } ], "username": [], From eb4d9b13863db943891c2ea26165314d1c2593ae Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 12 Apr 2016 16:55:32 +0200 Subject: [PATCH 71/89] Username setter tests and fixes. --- src/lib.rs | 51 ++++++++++++++++++++--------- src/slicing.rs | 6 ++-- tests/data.rs | 2 ++ tests/setters_tests.json | 71 +++++++++++++++++++++++++++++++++++++++- 4 files changed, 111 insertions(+), 19 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index af491b57..b44fd53c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -353,9 +353,13 @@ impl Url { self.slice(..self.scheme_end) } - /// Return whether the URL has a host. + /// Return whether the URL has an 'authority', + /// which can contain a username, password, host, and port number. + /// + /// URLs that do *not* are either path-only like `unix:/run/foo.socket` + /// or cannot-be-a-base like `data:text/plain,Stuff`. #[inline] - pub fn has_host(&self) -> bool { + pub fn has_authority(&self) -> bool { debug_assert!(self.byte_at(self.scheme_end) == b':'); self.slice(self.scheme_end + 1 ..).starts_with("//") } @@ -373,7 +377,7 @@ impl Url { /// Return the username for this URL (typically the empty string) /// as a percent-encoded ASCII string. pub fn username(&self) -> &str { - if self.has_host() { + if self.has_authority() { self.slice(self.scheme_end + 3..self.username_end) } else { "" @@ -384,7 +388,7 @@ impl Url { pub fn password(&self) -> Option<&str> { // This ':' is not the one marking a port number since a host can not be empty. // (Except for file: URLs, which do not have port numbers.) - if self.has_host() && self.byte_at(self.username_end) == b':' { + if self.has_authority() && self.byte_at(self.username_end) == b':' { debug_assert!(self.byte_at(self.host_start - 1) == b'@'); Some(self.slice(self.username_end + 1..self.host_start - 1)) } else { @@ -392,6 +396,11 @@ impl Url { } } + /// Equivalent to `url.host().is_some()`. 
+ pub fn has_host(&self) -> bool { + !matches!(self.host, HostInternal::None) + } + /// Return the string representation of the host (domain or IP address) for this URL, if any. /// /// Non-ASCII domains are punycode-encoded per IDNA. @@ -946,6 +955,7 @@ impl Url { return Err(()) } let username_start = self.scheme_end + 3; + debug_assert!(self.slice(self.scheme_end..username_start) == "://"); if self.slice(username_start..self.username_end) == username { return Ok(()) } @@ -953,24 +963,35 @@ impl Url { self.serialization.truncate(username_start as usize); self.serialization.extend(utf8_percent_encode(username, USERINFO_ENCODE_SET)); - let old_username_end = self.username_end; - let new_username_end = to_u32(self.serialization.len()).unwrap(); + let mut removed_bytes = self.username_end; + self.username_end = to_u32(self.serialization.len()).unwrap(); + let mut added_bytes = self.username_end; + + let new_username_is_empty = self.username_end == username_start; + match (new_username_is_empty, after_username.chars().next()) { + (true, Some('@')) => { + removed_bytes += 1; + self.serialization.push_str(&after_username[1..]); + } + (false, Some('@')) | (_, Some(':')) | (true, _) => { + self.serialization.push_str(&after_username); + } + (false, _) => { + added_bytes += 1; + self.serialization.push('@'); + self.serialization.push_str(&after_username); + } + } + let adjust = |index: &mut u32| { - *index -= old_username_end; - *index += new_username_end; + *index -= removed_bytes; + *index += added_bytes; }; - - self.username_end = new_username_end; adjust(&mut self.host_start); adjust(&mut self.host_end); adjust(&mut self.path_start); if let Some(ref mut index) = self.query_start { adjust(index) } if let Some(ref mut index) = self.fragment_start { adjust(index) } - - if !after_username.starts_with(|c| matches!(c, '@' | ':')) { - self.serialization.push('@'); - } - self.serialization.push_str(&after_username); Ok(()) } diff --git a/src/slicing.rs b/src/slicing.rs index 
2c8d75e6..5a303e4b 100644 --- a/src/slicing.rs +++ b/src/slicing.rs @@ -105,7 +105,7 @@ impl Url { Position::AfterScheme => self.scheme_end as usize, - Position::BeforeUsername => if self.has_host() { + Position::BeforeUsername => if self.has_authority() { self.scheme_end as usize + "://".len() } else { debug_assert!(self.byte_at(self.scheme_end) == b':'); @@ -115,7 +115,7 @@ impl Url { Position::AfterUsername => self.username_end as usize, - Position::BeforePassword => if self.has_host() && + Position::BeforePassword => if self.has_authority() && self.byte_at(self.username_end) == b':' { self.username_end as usize + ":".len() } else { @@ -123,7 +123,7 @@ impl Url { self.username_end as usize }, - Position::AfterPassword => if self.has_host() && + Position::AfterPassword => if self.has_authority() && self.byte_at(self.username_end) == b':' { debug_assert!(self.byte_at(self.host_start - "@".len() as u32) == b'@'); self.host_start as usize - "@".len() diff --git a/tests/data.rs b/tests/data.rs index b0d23bb9..61c893f1 100644 --- a/tests/data.rs +++ b/tests/data.rs @@ -144,9 +144,11 @@ fn collect_setters(add_test: &mut F) where F: FnMut(String, test::TestFn) { let mut expected = test.take("expected").unwrap(); add_test(name, test::TestFn::dyn_test_fn(move || { let mut url = Url::parse(&href).unwrap(); + url.assert_invariants(); let _ = quirks::$setter(&mut url, &new_value); assert_attributes!(url, expected, href protocol username password host hostname port pathname search hash); + url.assert_invariants(); })) } }} diff --git a/tests/setters_tests.json b/tests/setters_tests.json index 83577a8c..cc75da9c 100644 --- a/tests/setters_tests.json +++ b/tests/setters_tests.json @@ -138,7 +138,76 @@ } } ], - "username": [], + "username": [ + { + "comment": "No host means no username", + "href": "file:///home/you/index.html", + "new_value": "me", + "expected": { + "href": "file:///home/you/index.html", + "username": "" + } + }, + { + "comment": "Cannot-be-a-base means no 
username", + "href": "mailto:you@example.net", + "new_value": "me", + "expected": { + "href": "mailto:you@example.net", + "username": "" + } + }, + { + "href": "http://example.net", + "new_value": "me", + "expected": { + "href": "http://me@example.net/", + "username": "me" + } + }, + { + "href": "http://:secret@example.net", + "new_value": "me", + "expected": { + "href": "http://me:secret@example.net/", + "username": "me" + } + }, + { + "href": "http://me@example.net", + "new_value": "", + "expected": { + "href": "http://example.net/", + "username": "" + } + }, + { + "href": "http://me:secret@example.net", + "new_value": "", + "expected": { + "href": "http://:secret@example.net/", + "username": "" + } + }, + { + "comment": "UTF-8 percent encoding with the userinfo encode set.", + "href": "http://example.net", + "new_value": "\u0000\u0001\t\n\r\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", + "expected": { + "href": "http://%00%01%09%0A%0D%1F%20!%22%23$%&'()*+,-.%2F09%3A%3B%3C%3D%3E%3F%40AZ%5B%5C%5D%5E_%60az%7B%7C%7D~%7F%C2%80%C2%81%C3%89%C3%A9@example.net/", + "username": "%00%01%09%0A%0D%1F%20!%22%23$%&'()*+,-.%2F09%3A%3B%3C%3D%3E%3F%40AZ%5B%5C%5D%5E_%60az%7B%7C%7D~%7F%C2%80%C2%81%C3%89%C3%A9" + } + }, + { + "comment": "Bytes already percent-encoded are left as-is.", + "href": "http://example.net", + "new_value": "%c3%89t%c3%a9", + "expected": { + "href": "http://%c3%89t%c3%a9@example.net/", + "username": "%c3%89t%c3%a9" + } + } + ], "password": [], "host": [], "hostname": [], From 777413d51ac167754b05b0d041195f9ab556cdae Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 12 Apr 2016 17:14:06 +0200 Subject: [PATCH 72/89] Password setter tests and fixes --- src/lib.rs | 1 + tests/setters_tests.json | 89 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 89 insertions(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index b44fd53c..b1cfc86c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -941,6 +941,7 @@ impl Url { let offset = end 
- start; self.host_start -= offset; self.host_end -= offset; + self.path_start -= offset; if let Some(ref mut index) = self.query_start { *index -= offset } if let Some(ref mut index) = self.fragment_start { *index -= offset } } diff --git a/tests/setters_tests.json b/tests/setters_tests.json index cc75da9c..56f78c20 100644 --- a/tests/setters_tests.json +++ b/tests/setters_tests.json @@ -148,6 +148,15 @@ "username": "" } }, + { + "comment": "No host means no username", + "href": "unix:/run/foo.socket", + "new_value": "me", + "expected": { + "href": "unix:/run/foo.socket", + "username": "" + } + }, { "comment": "Cannot-be-a-base means no username", "href": "mailto:you@example.net", @@ -208,7 +217,85 @@ } } ], - "password": [], + "password": [ + { + "comment": "No host means no password", + "href": "file:///home/me/index.html", + "new_value": "secret", + "expected": { + "href": "file:///home/me/index.html", + "password": "" + } + }, + { + "comment": "No host means no password", + "href": "unix:/run/foo.socket", + "new_value": "secret", + "expected": { + "href": "unix:/run/foo.socket", + "password": "" + } + }, + { + "comment": "Cannot-be-a-base means no password", + "href": "mailto:me@example.net", + "new_value": "secret", + "expected": { + "href": "mailto:me@example.net", + "password": "" + } + }, + { + "href": "http://example.net", + "new_value": "secret", + "expected": { + "href": "http://:secret@example.net/", + "password": "secret" + } + }, + { + "href": "http://me@example.net", + "new_value": "secret", + "expected": { + "href": "http://me:secret@example.net/", + "password": "secret" + } + }, + { + "href": "http://:secret@example.net", + "new_value": "", + "expected": { + "href": "http://example.net/", + "password": "" + } + }, + { + "href": "http://me:secret@example.net", + "new_value": "", + "expected": { + "href": "http://me@example.net/", + "password": "" + } + }, + { + "comment": "UTF-8 percent encoding with the userinfo encode set.", + "href": 
"http://example.net", + "new_value": "\u0000\u0001\t\n\r\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", + "expected": { + "href": "http://:%00%01%09%0A%0D%1F%20!%22%23$%&'()*+,-.%2F09%3A%3B%3C%3D%3E%3F%40AZ%5B%5C%5D%5E_%60az%7B%7C%7D~%7F%C2%80%C2%81%C3%89%C3%A9@example.net/", + "password": "%00%01%09%0A%0D%1F%20!%22%23$%&'()*+,-.%2F09%3A%3B%3C%3D%3E%3F%40AZ%5B%5C%5D%5E_%60az%7B%7C%7D~%7F%C2%80%C2%81%C3%89%C3%A9" + } + }, + { + "comment": "Bytes already percent-encoded are left as-is.", + "href": "http://example.net", + "new_value": "%c3%89t%c3%a9", + "expected": { + "href": "http://:%c3%89t%c3%a9@example.net/", + "password": "%c3%89t%c3%a9" + } + } + ], "host": [], "hostname": [], "port": [], From 42e608e2361da02f39a60f8f3537e0e3d108c514 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 13 Apr 2016 15:16:00 +0200 Subject: [PATCH 73/89] host setter tests and fixes --- src/parser.rs | 27 ++-- src/quirks.rs | 7 +- tests/setters_tests.json | 318 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 337 insertions(+), 15 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 8ecc790a..77bcc6e0 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -108,7 +108,7 @@ pub struct Parser<'a> { pub context: Context, } -#[derive(PartialEq, Eq)] +#[derive(PartialEq, Eq, Copy, Clone)] pub enum Context { UrlParser, Setter, @@ -601,7 +601,7 @@ impl<'a> Parser<'a> { Ok((username_end, remaining)) } - pub fn parse_host_and_port<'i>(&mut self, input: &'i str, + fn parse_host_and_port<'i>(&mut self, input: &'i str, scheme_end: u32, scheme_type: SchemeType) -> ParseResult<(u32, HostInternal, Option, &'i str)> { let (host, remaining) = try!( @@ -611,7 +611,7 @@ impl<'a> Parser<'a> { let (port, remaining) = if remaining.starts_with(":") { let syntax_violation = |message| self.syntax_violation(message); let scheme = || default_port(&self.serialization[..scheme_end as usize]); - try!(Parser::parse_port(&remaining[1..], syntax_violation, scheme)) + 
try!(Parser::parse_port(&remaining[1..], syntax_violation, scheme, self.context)) } else { (None, remaining) }; @@ -705,10 +705,11 @@ impl<'a> Parser<'a> { Ok((true, host, &input[end..])) } - pub fn parse_port<'i, V, P>(input: &'i str, syntax_violation: V, default_port: P) + pub fn parse_port<'i, V, P>(input: &'i str, syntax_violation: V, default_port: P, + context: Context) -> ParseResult<(Option, &'i str)> where V: Fn(&'static str), P: Fn() -> Option { - let mut port = 0; + let mut port: u32 = 0; let mut has_any_digit = false; let mut end = input.len(); for (i, c) in input.char_indices() { @@ -720,13 +721,17 @@ impl<'a> Parser<'a> { has_any_digit = true; } else { match c { - '/' | '\\' | '?' | '#' => { - end = i; - break - }, - '\t' | '\n' | '\r' => syntax_violation("invalid character"), - _ => return Err(ParseError::InvalidPort) + '\t' | '\n' | '\r' => { + syntax_violation("invalid character"); + continue + } + '/' | '\\' | '?' | '#' => {} + _ => if context == Context::UrlParser { + return Err(ParseError::InvalidPort) + } } + end = i; + break } } let mut opt_port = Some(port as u16); diff --git a/src/quirks.rs b/src/quirks.rs index cbb728ba..55bef503 100644 --- a/src/quirks.rs +++ b/src/quirks.rs @@ -12,7 +12,7 @@ //! you probably want to use `Url` method instead. 
use {Url, Position, Host, ParseError, idna}; -use parser::{Parser, SchemeType, default_port}; +use parser::{Parser, SchemeType, default_port, Context}; /// https://url.spec.whatwg.org/#dom-url-domaintoascii pub fn domain_to_ascii(domain: &str) -> String { @@ -107,7 +107,8 @@ pub fn set_host(url: &mut Url, new_host: &str) -> Result<(), ()> { Ok((h, remaining)) => { host = h; opt_port = if remaining.starts_with(':') { - Parser::parse_port(remaining, |_| (), || default_port(scheme)) + Parser::parse_port(&remaining[1..], |_| (), || default_port(scheme), + Context::Setter) .ok().map(|(port, _remaining)| port) } else { None @@ -155,7 +156,7 @@ pub fn set_port(url: &mut Url, new_port: &str) -> Result<(), ()> { if !url.has_host() || scheme == "file" { return Err(()) } - result = Parser::parse_port(new_port, |_| (), || default_port(scheme)) + result = Parser::parse_port(new_port, |_| (), || default_port(scheme), Context::Setter) } if let Ok((new_port, _remaining)) = result { url.set_port_internal(new_port); diff --git a/tests/setters_tests.json b/tests/setters_tests.json index 56f78c20..7e708487 100644 --- a/tests/setters_tests.json +++ b/tests/setters_tests.json @@ -83,6 +83,15 @@ "protocol": "bc0+-.:" } }, + { + "comment": "Only some punctuation is acceptable", + "href": "a://example.net", + "new_value": "b,c", + "expected": { + "href": "a://example.net/", + "protocol": "a:" + } + }, { "comment": "Non-ASCII is rejected", "href": "a://example.net", @@ -296,7 +305,314 @@ } } ], - "host": [], + "host": [ + { + "comment": "Cannot-be-a-base means no host", + "href": "mailto:me@example.net", + "new_value": "example.com", + "expected": { + "href": "mailto:me@example.net", + "host": "" + } + }, + { + "comment": "Cannot-be-a-base means no password", + "href": "data:text/plain,Stuff", + "new_value": "example.net", + "expected": { + "href": "data:text/plain,Stuff", + "host": "" + } + }, + { + "href": "http://example.net", + "new_value": "example.com:8080", + "expected": { + "href": 
"http://example.com:8080/", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "Port number is unchanged if not specified in the new value", + "href": "http://example.net:8080", + "new_value": "example.com", + "expected": { + "href": "http://example.com:8080/", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "Port number is removed if empty in the new value: https://github.com/whatwg/url/pull/113", + "href": "http://example.net:8080", + "new_value": "example.com:", + "expected": { + "href": "http://example.com/", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Port number can be removed", + "href": "http://example.net:8080", + "new_value": "example.com:", + "expected": { + "href": "http://example.com/", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "The empty host is not valid for special schemes", + "href": "http://example.net", + "new_value": "", + "expected": { + "href": "http://example.net/", + "host": "example.net" + } + }, + { + "comment": "The empty host is OK for non-special schemes", + "href": "view-source+http://example.net/foo", + "new_value": "", + "expected": { + "href": "view-source+http:///foo", + "host": "" + } + }, + { + "comment": "Path-only URLs can gain a host", + "href": "a:/foo", + "new_value": "example.net", + "expected": { + "href": "a://example.net/foo", + "host": "example.net" + } + }, + { + "comment": "Path-only URLs can gain a host", + "href": "a:/foo", + "new_value": "example.net", + "expected": { + "href": "a://example.net/foo", + "host": "example.net" + } + }, + { + "comment": "IPv4 address syntax is normalized", + "href": "http://example.net", + "new_value": "0x7F000001:8080", + "expected": { + "href": "http://127.0.0.1:8080/", + "host": "127.0.0.1:8080", + "hostname": "127.0.0.1", + "port": "8080" + } + }, + { + "comment": "IPv6 address 
syntax is normalized", + "href": "http://example.net", + "new_value": "[::0:01]:2", + "expected": { + "href": "http://[::1]:2/", + "host": "[::1]:2", + "hostname": "[::1]", + "port": "2" + } + }, + { + "comment": "Default port number is removed", + "href": "http://example.net", + "new_value": "example.com:80", + "expected": { + "href": "http://example.com/", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Default port number is removed", + "href": "https://example.net", + "new_value": "example.com:443", + "expected": { + "href": "https://example.com/", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Default port number is only removed for the relevant scheme", + "href": "https://example.net", + "new_value": "example.com:80", + "expected": { + "href": "https://example.com:80/", + "host": "example.com:80", + "hostname": "example.com", + "port": "80" + } + }, + { + "comment": "Stuff after a / delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com/stuff", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Stuff after a / delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com:8080/stuff", + "expected": { + "href": "http://example.com:8080/path", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "Stuff after a ? delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com?stuff", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Stuff after a ? 
delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com:8080?stuff", + "expected": { + "href": "http://example.com:8080/path", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "Stuff after a # delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com#stuff", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Stuff after a # delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com:8080#stuff", + "expected": { + "href": "http://example.com:8080/path", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "Stuff after a \\ delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com\\stuff", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Stuff after a \\ delimiter is ignored for special schemes", + "href": "http://example.net/path", + "new_value": "example.com:8080\\stuff", + "expected": { + "href": "http://example.com:8080/path", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "\\ is not a delimiter for non-special schemes, and it’s invalid in a domain", + "href": "view-source+http://example.net/path", + "new_value": "example.com\\stuff", + "expected": { + "href": "view-source+http://example.net/path", + "host": "example.net", + "hostname": "example.net", + "port": "" + } + }, + { + "comment": "Anything other than ASCII digit stops the port parser in a setter but is not an error", + "href": "view-source+http://example.net/path", + "new_value": "example.com:8080stuff2", + "expected": { + "href": "view-source+http://example.com:8080/path", + "host": "example.com:8080", + "hostname": "example.com", + "port": 
"8080" + } + }, + { + "comment": "Anything other than ASCII digit stops the port parser in a setter but is not an error", + "href": "http://example.net/path", + "new_value": "example.com:8080stuff2", + "expected": { + "href": "http://example.com:8080/path", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "Anything other than ASCII digit stops the port parser in a setter but is not an error", + "href": "http://example.net/path", + "new_value": "example.com:8080+2", + "expected": { + "href": "http://example.com:8080/path", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "Port numbers are 16 bit integers", + "href": "http://example.net/path", + "new_value": "example.com:65535", + "expected": { + "href": "http://example.com:65535/path", + "host": "example.com:65535", + "hostname": "example.com", + "port": "65535" + } + }, + { + "comment": "Port numbers are 16 bit integers, overflowing is an error. 
Hostname is still set, though.", + "href": "http://example.net/path", + "new_value": "example.com:65536", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + } + ], "hostname": [], "port": [], "pathname": [], From e65ece7926d26288be5bb924db6137285300d615 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 13 Apr 2016 15:30:25 +0200 Subject: [PATCH 74/89] Add hostname setter tests --- tests/setters_tests.json | 171 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 168 insertions(+), 3 deletions(-) diff --git a/tests/setters_tests.json b/tests/setters_tests.json index 7e708487..2d2fc5c8 100644 --- a/tests/setters_tests.json +++ b/tests/setters_tests.json @@ -525,7 +525,7 @@ } }, { - "comment": "Stuff after a \\ delimiter is ignored", + "comment": "Stuff after a \\ delimiter is ignored for special schemes", "href": "http://example.net/path", "new_value": "example.com\\stuff", "expected": { @@ -613,8 +613,173 @@ } } ], - "hostname": [], - "port": [], + "hostname": [ + { + "comment": "Cannot-be-a-base means no host", + "href": "mailto:me@example.net", + "new_value": "example.com", + "expected": { + "href": "mailto:me@example.net", + "host": "" + } + }, + { + "comment": "Cannot-be-a-base means no password", + "href": "data:text/plain,Stuff", + "new_value": "example.net", + "expected": { + "href": "data:text/plain,Stuff", + "host": "" + } + }, + { + "href": "http://example.net:8080", + "new_value": "example.com", + "expected": { + "href": "http://example.com:8080/", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "The empty host is not valid for special schemes", + "href": "http://example.net", + "new_value": "", + "expected": { + "href": "http://example.net/", + "host": "example.net" + } + }, + { + "comment": "The empty host is OK for non-special schemes", + "href": "view-source+http://example.net/foo", + "new_value": "", + "expected": { 
+ "href": "view-source+http:///foo", + "host": "" + } + }, + { + "comment": "Path-only URLs can gain a host", + "href": "a:/foo", + "new_value": "example.net", + "expected": { + "href": "a://example.net/foo", + "host": "example.net" + } + }, + { + "comment": "Path-only URLs can gain a host", + "href": "a:/foo", + "new_value": "example.net", + "expected": { + "href": "a://example.net/foo", + "host": "example.net" + } + }, + { + "comment": "IPv4 address syntax is normalized", + "href": "http://example.net:8080", + "new_value": "0x7F000001", + "expected": { + "href": "http://127.0.0.1:8080/", + "host": "127.0.0.1:8080", + "hostname": "127.0.0.1", + "port": "8080" + } + }, + { + "comment": "IPv6 address syntax is normalized", + "href": "http://example.net", + "new_value": "[::0:01]", + "expected": { + "href": "http://[::1]/", + "host": "[::1]", + "hostname": "[::1]", + "port": "" + } + }, + { + "comment": "Stuff after a : delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com:8080", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Stuff after a : delimiter is ignored", + "href": "http://example.net:8080/path", + "new_value": "example.com:", + "expected": { + "href": "http://example.com:8080/path", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "Stuff after a / delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com/stuff", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Stuff after a ? 
delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com?stuff", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Stuff after a # delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com#stuff", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Stuff after a \\ delimiter is ignored for special schemes", + "href": "http://example.net/path", + "new_value": "example.com\\stuff", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "\\ is not a delimiter for non-special schemes, and it’s invalid in a domain", + "href": "view-source+http://example.net/path", + "new_value": "example.com\\stuff", + "expected": { + "href": "view-source+http://example.net/path", + "host": "example.net", + "hostname": "example.net", + "port": "" + } + } + ], + "port": [ + ], "pathname": [], "search": [], "hash": [] From 26bdb3abcfec4d0f85870904f5689b1553fd6e39 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 13 Apr 2016 15:44:33 +0200 Subject: [PATCH 75/89] Port setter tests and fixes --- src/lib.rs | 1 + tests/setters_tests.json | 164 ++++++++++++++++++++++++++++++++++++--- 2 files changed, 154 insertions(+), 11 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index b1cfc86c..cd3b0653 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -813,6 +813,7 @@ impl Url { self.serialization.push_str(&path_and_after); } } + self.port = port; } /// Change this URL’s host. 
diff --git a/tests/setters_tests.json b/tests/setters_tests.json index 2d2fc5c8..f6700f30 100644 --- a/tests/setters_tests.json +++ b/tests/setters_tests.json @@ -356,17 +356,6 @@ "port": "" } }, - { - "comment": "Port number can be removed", - "href": "http://example.net:8080", - "new_value": "example.com:", - "expected": { - "href": "http://example.com/", - "host": "example.com", - "hostname": "example.com", - "port": "" - } - }, { "comment": "The empty host is not valid for special schemes", "href": "http://example.net", @@ -779,6 +768,159 @@ } ], "port": [ + { + "href": "http://example.net", + "new_value": "8080", + "expected": { + "href": "http://example.net:8080/", + "host": "example.net:8080", + "hostname": "example.net", + "port": "8080" + } + }, + { + "comment": "Port number is removed if empty in the new value: https://github.com/whatwg/url/pull/113", + "href": "http://example.net:8080", + "new_value": "", + "expected": { + "href": "http://example.net/", + "host": "example.net", + "hostname": "example.net", + "port": "" + } + }, + { + "comment": "Default port number is removed", + "href": "http://example.net:8080", + "new_value": "80", + "expected": { + "href": "http://example.net/", + "host": "example.net", + "hostname": "example.net", + "port": "" + } + }, + { + "comment": "Default port number is removed", + "href": "https://example.net:4433", + "new_value": "443", + "expected": { + "href": "https://example.net/", + "host": "example.net", + "hostname": "example.net", + "port": "" + } + }, + { + "comment": "Default port number is only removed for the relevant scheme", + "href": "https://example.net", + "new_value": "80", + "expected": { + "href": "https://example.net:80/", + "host": "example.net:80", + "hostname": "example.net", + "port": "80" + } + }, + { + "comment": "Stuff after a / delimiter is ignored", + "href": "http://example.net/path", + "new_value": "8080/stuff", + "expected": { + "href": "http://example.net:8080/path", + "host": 
"example.net:8080", + "hostname": "example.net", + "port": "8080" + } + }, + { + "comment": "Stuff after a ? delimiter is ignored", + "href": "http://example.net/path", + "new_value": "8080?stuff", + "expected": { + "href": "http://example.net:8080/path", + "host": "example.net:8080", + "hostname": "example.net", + "port": "8080" + } + }, + { + "comment": "Stuff after a # delimiter is ignored", + "href": "http://example.net/path", + "new_value": "8080#stuff", + "expected": { + "href": "http://example.net:8080/path", + "host": "example.net:8080", + "hostname": "example.net", + "port": "8080" + } + }, + { + "comment": "Stuff after a \\ delimiter is ignored for special schemes", + "href": "http://example.net/path", + "new_value": "8080\\stuff", + "expected": { + "href": "http://example.net:8080/path", + "host": "example.net:8080", + "hostname": "example.net", + "port": "8080" + } + }, + { + "comment": "Anything other than ASCII digit stops the port parser in a setter but is not an error", + "href": "view-source+http://example.net/path", + "new_value": "8080stuff2", + "expected": { + "href": "view-source+http://example.net:8080/path", + "host": "example.net:8080", + "hostname": "example.net", + "port": "8080" + } + }, + { + "comment": "Anything other than ASCII digit stops the port parser in a setter but is not an error", + "href": "http://example.net/path", + "new_value": "8080stuff2", + "expected": { + "href": "http://example.net:8080/path", + "host": "example.net:8080", + "hostname": "example.net", + "port": "8080" + } + }, + { + "comment": "Anything other than ASCII digit stops the port parser in a setter but is not an error", + "href": "http://example.net/path", + "new_value": "8080+2", + "expected": { + "href": "http://example.net:8080/path", + "host": "example.net:8080", + "hostname": "example.net", + "port": "8080" + } + }, + { + "comment": "Port numbers are 16 bit integers", + "href": "http://example.net/path", + "new_value": "65535", + "expected": { + "href": 
"http://example.net:65535/path", + "host": "example.net:65535", + "hostname": "example.net", + "port": "65535" + } + }, + { + "comment": "Port numbers are 16 bit integers, overflowing is an error", + "href": "http://example.net:8080/path", + "new_value": "65536", + "expected": { + "href": "http://example.net:8080/path", + "host": "example.net:8080", + "hostname": "example.net", + "port": "8080" + } + } ], "pathname": [], "search": [], From 680d93c41da1a39652f22164430e55e6c764ba34 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 13 Apr 2016 16:28:49 +0200 Subject: [PATCH 76/89] pathname setter tests and fixes --- src/parser.rs | 2 +- tests/setters_tests.json | 72 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 77bcc6e0..a2482518 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -748,7 +748,7 @@ impl<'a> Parser<'a> { let mut iter = input.chars(); match iter.next() { Some('/') => input = iter.as_str(), - Some('\\') => { + Some('\\') if scheme_type.is_special() => { self.syntax_violation("backslash"); input = iter.as_str() } diff --git a/tests/setters_tests.json b/tests/setters_tests.json index f6700f30..0e4183c0 100644 --- a/tests/setters_tests.json +++ b/tests/setters_tests.json @@ -922,7 +922,77 @@ } } ], - "pathname": [], + "pathname": [ + { + "comment": "Cannot-be-a-base don’t have a path", + "href": "mailto:me@example.net", + "new_value": "/foo", + "expected": { + "href": "mailto:me@example.net", + "pathname": "me@example.net" + } + }, + { + "href": "unix:/run/foo.socket?timeout=10", + "new_value": "/var/log/../run/bar.socket", + "expected": { + "href": "unix:/var/run/bar.socket?timeout=10", + "pathname": "/var/run/bar.socket" + } + }, + { + "href": "https://example.net#nav", + "new_value": "home", + "expected": { + "href": "https://example.net/home#nav", + "pathname": "/home" + } + }, + { + "href": "https://example.net#nav", + "new_value": "../home", + "expected": { 
+ "href": "https://example.net/home#nav", + "pathname": "/home" + } + }, + { + "comment": "\\ is a segment delimiter for 'special' URLs", + "href": "http://example.net/home?lang=fr#nav", + "new_value": "\\a\\%2E\\b\\%2e.\\c", + "expected": { + "href": "http://example.net/a/c?lang=fr#nav", + "pathname": "/a/c" + } + }, + { + "comment": "\\ is *not* a segment delimiter for non-'special' URLs", + "href": "view-source+http://example.net/home?lang=fr#nav", + "new_value": "\\a\\%2E\\b\\%2e.\\c", + "expected": { + "href": "view-source+http://example.net/\\a\\.\\b\\..\\c?lang=fr#nav", + "pathname": "/\\a\\.\\b\\..\\c" + } + }, + { + "comment": "UTF-8 percent encoding with the default encode set. Tabs and newlines are removed.", + "href": "a:/", + "new_value": "\u0000\u0001\t\n\r\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", + "expected": { + "href": "a:/%00%01%1F%20!%22%23$%&'()*+,-./09:;%3C=%3E%3F@AZ[\\]^_%60az%7B|%7D~%7F%C2%80%C2%81%C3%89%C3%A9", + "pathname": "/%00%01%1F%20!%22%23$%&'()*+,-./09:;%3C=%3E%3F@AZ[\\]^_%60az%7B|%7D~%7F%C2%80%C2%81%C3%89%C3%A9" + } + }, + { + "comment": "Bytes already percent-encoded are left as-is, except %2E.", + "href": "http://example.net", + "new_value": "%c3%89t%c3%a9%2e%2E", + "expected": { + "href": "http://example.net/%c3%89t%c3%a9..", + "pathname": "/%c3%89t%c3%a9.." 
+ } + } + ], "search": [], "hash": [] } From de99ede58d9b26dfea3c688f1980593c3c1a4659 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 13 Apr 2016 16:52:24 +0200 Subject: [PATCH 77/89] Add search setter tests --- tests/setters_tests.json | 105 +++++++++++++++++++++++++++++++++++---- 1 file changed, 94 insertions(+), 11 deletions(-) diff --git a/tests/setters_tests.json b/tests/setters_tests.json index 0e4183c0..adaa9c03 100644 --- a/tests/setters_tests.json +++ b/tests/setters_tests.json @@ -219,10 +219,10 @@ { "comment": "Bytes already percent-encoded are left as-is.", "href": "http://example.net", - "new_value": "%c3%89t%c3%a9", + "new_value": "%c3%89té", "expected": { - "href": "http://%c3%89t%c3%a9@example.net/", - "username": "%c3%89t%c3%a9" + "href": "http://%c3%89t%C3%A9@example.net/", + "username": "%c3%89t%C3%A9" } } ], @@ -298,10 +298,10 @@ { "comment": "Bytes already percent-encoded are left as-is.", "href": "http://example.net", - "new_value": "%c3%89t%c3%a9", + "new_value": "%c3%89té", "expected": { - "href": "http://:%c3%89t%c3%a9@example.net/", - "password": "%c3%89t%c3%a9" + "href": "http://:%c3%89t%C3%A9@example.net/", + "password": "%c3%89t%C3%A9" } } ], @@ -986,13 +986,96 @@ { "comment": "Bytes already percent-encoded are left as-is, except %2E.", "href": "http://example.net", - "new_value": "%c3%89t%c3%a9%2e%2E", + "new_value": "%2e%2E%c3%89té", "expected": { - "href": "http://example.net/%c3%89t%c3%a9..", - "pathname": "/%c3%89t%c3%a9.." 
+ "href": "http://example.net/..%c3%89t%C3%A9", + "pathname": "/..%c3%89t%C3%A9" } } - ], - "search": [], + ], + "search": [ + { + "href": "https://example.net#nav", + "new_value": "lang=fr", + "expected": { + "href": "https://example.net/?lang=fr#nav", + "search": "?lang=fr" + } + }, + { + "href": "https://example.net?lang=en-US#nav", + "new_value": "lang=fr", + "expected": { + "href": "https://example.net/?lang=fr#nav", + "search": "?lang=fr" + } + }, + { + "href": "https://example.net?lang=en-US#nav", + "new_value": "?lang=fr", + "expected": { + "href": "https://example.net/?lang=fr#nav", + "search": "?lang=fr" + } + }, + { + "href": "https://example.net?lang=en-US#nav", + "new_value": "??lang=fr", + "expected": { + "href": "https://example.net/??lang=fr#nav", + "search": "??lang=fr" + } + }, + { + "href": "https://example.net?lang=en-US#nav", + "new_value": "?", + "expected": { + "href": "https://example.net/?#nav", + "search": "" + } + }, + { + "href": "https://example.net?lang=en-US#nav", + "new_value": "", + "expected": { + "href": "https://example.net/#nav", + "search": "" + } + }, + { + "href": "https://example.net?lang=en-US", + "new_value": "", + "expected": { + "href": "https://example.net/", + "search": "" + } + }, + { + "href": "https://example.net", + "new_value": "", + "expected": { + "href": "https://example.net/", + "search": "" + } + }, + { + "comment": "UTF-8 percent encoding with the query encode set. 
Tabs and newlines are removed.", + "href": "a:/", + "new_value": "\u0000\u0001\t\n\r\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", + "expected": { + "href": "a:/?%00%01%1F%20!%22%23$%&'()*+,-./09:;%3C=%3E?@AZ[\\]^_`az{|}~%7F%C2%80%C2%81%C3%89%C3%A9", + "search": "?%00%01%1F%20!%22%23$%&'()*+,-./09:;%3C=%3E?@AZ[\\]^_`az{|}~%7F%C2%80%C2%81%C3%89%C3%A9" + } + }, + { + "comment": "Bytes already percent-encoded are left as-is", + "href": "http://example.net", + "new_value": "%c3%89té", + "expected": { + "href": "http://example.net/?%c3%89t%C3%A9", + "search": "?%c3%89t%C3%A9" + } + } + ], "hash": [] } From d6da85d73752efab26e6fa7b166be2321248f736 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 13 Apr 2016 17:11:08 +0200 Subject: [PATCH 78/89] Add tests for hash setter. --- tests/setters_tests.json | 69 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/tests/setters_tests.json b/tests/setters_tests.json index adaa9c03..b60e49a9 100644 --- a/tests/setters_tests.json +++ b/tests/setters_tests.json @@ -1077,5 +1077,72 @@ } } ], - "hash": [] + "hash": [ + { + "href": "https://example.net", + "new_value": "main", + "expected": { + "href": "https://example.net/#main", + "hash": "#main" + } + }, + { + "href": "https://example.net#nav", + "new_value": "main", + "expected": { + "href": "https://example.net/#main", + "hash": "#main" + } + }, + { + "href": "https://example.net?lang=en-US", + "new_value": "##nav", + "expected": { + "href": "https://example.net/?lang=en-US##nav", + "hash": "##nav" + } + }, + { + "href": "https://example.net?lang=en-US#nav", + "new_value": "#main", + "expected": { + "href": "https://example.net/?lang=en-US#main", + "hash": "#main" + } + }, + { + "href": "https://example.net?lang=en-US#nav", + "new_value": "#", + "expected": { + "href": "https://example.net/?lang=en-US#", + "hash": "" + } + }, + { + "href": "https://example.net?lang=en-US#nav", + "new_value": "", + 
"expected": { + "href": "https://example.net/?lang=en-US", + "hash": "" + } + }, + { + "comment": "No percent-encoding at all (!); nuls, tabs, and newlines are removed", + "href": "a:/", + "new_value": "\u0000\u0001\t\n\r\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", + "expected": { + "href": "a:/#\u0001\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", + "hash": "#\u0001\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé" + } + }, + { + "comment": "Bytes already percent-encoded are left as-is", + "href": "http://example.net", + "new_value": "%c3%89té", + "expected": { + "href": "http://example.net/#%c3%89té", + "hash": "#%c3%89té" + } + } + ] } From 1fdd0194d4ef8356f82b5a3f7f7a4bf842f007d2 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 19 Apr 2016 15:06:01 +0200 Subject: [PATCH 79/89] Rename append_pair_iter to extend_pairs --- src/form_urlencoded.rs | 2 +- tests/unit.rs | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index c9f4156a..4c574891 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -291,7 +291,7 @@ impl Serializer { /// to limit the scope of `Serializer`’s borrow of its string. /// /// Panics if called after `.finish()`. 
- pub fn append_pair_iter(&mut self, iter: I) -> &mut Self + pub fn extend_pairs(&mut self, iter: I) -> &mut Self where I: IntoIterator, I::Item: Borrow<(K, V)>, K: AsRef, V: AsRef { { let string = string(&mut self.target); diff --git a/tests/unit.rs b/tests/unit.rs index 2c458cae..44dbd64e 100644 --- a/tests/unit.rs +++ b/tests/unit.rs @@ -66,8 +66,8 @@ fn new_path_windows_fun() { // Invalid UTF-8 assert!(Url::parse("file:///C:/foo/ba%80r").unwrap().to_file_path().is_err()); - - // test windows canonicalized path + + // test windows canonicalized path let path = PathBuf::from(r"\\?\C:\foo\bar"); assert!(Url::from_file_path(path).is_ok()); } @@ -219,7 +219,7 @@ fn test_form_urlencoded() { ("bar".into(), "".into()), ("foo".into(), "#".into()) ]; - let encoded = form_urlencoded::Serializer::new(String::new()).append_pair_iter(pairs).finish(); + let encoded = form_urlencoded::Serializer::new(String::new()).extend_pairs(pairs).finish(); assert_eq!(encoded, "foo=%C3%A9%26&bar=&foo=%23"); assert_eq!(form_urlencoded::parse(encoded.as_bytes()).collect::>(), pairs.to_vec()); } From 8bc3285ccc9fd199485bacb098493a59f15277bd Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 19 Apr 2016 18:22:31 +0200 Subject: [PATCH 80/89] Typo fix. --- idna/src/uts46.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/idna/src/uts46.rs b/idna/src/uts46.rs index 5f230e0e..1570ab33 100644 --- a/idna/src/uts46.rs +++ b/idna/src/uts46.rs @@ -266,7 +266,7 @@ enum Error { /// Errors recorded during UTS #46 processing. /// -/// This is opaque for now, only indicating the precense of at least one error. +/// This is opaque for now, only indicating the presence of at least one error. /// More details may be exposed in the future. #[derive(Debug)] pub struct Errors(Vec); From adba760e296790e98beda1bda5fa5b183d983032 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 19 Apr 2016 18:24:41 +0200 Subject: [PATCH 81/89] Copyright date Does it matter? 
--- src/form_urlencoded.rs | 2 +- src/host.rs | 2 +- src/parser.rs | 2 +- src/percent_encoding.rs | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index 4c574891..2b5fb40c 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2015 Simon Sapin. +// Copyright 2013-2016 Simon Sapin. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/src/host.rs b/src/host.rs index 99f8cae4..093cb5ba 100644 --- a/src/host.rs +++ b/src/host.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2014 Simon Sapin. +// Copyright 2013-2016 Simon Sapin. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/src/parser.rs b/src/parser.rs index a2482518..50bf0107 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2014 Simon Sapin. +// Copyright 2013-2016 Simon Sapin. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs index 588b4ba2..f91402d4 100644 --- a/src/percent_encoding.rs +++ b/src/percent_encoding.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2014 Simon Sapin. +// Copyright 2013-2016 Simon Sapin. // // Licensed under the Apache License, Version 2.0 or the MIT license From cd9fda8e30d1efaf9b70aa50376b056db9f1243a Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 19 Apr 2016 18:25:31 +0200 Subject: [PATCH 82/89] Fix/clarify comments, and add a debug_assert! 
--- src/encoding.rs | 11 +++++++++-- src/lib.rs | 8 +++++--- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/src/encoding.rs b/src/encoding.rs index 4564e340..ed88bb80 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -74,6 +74,7 @@ impl EncodingOverride { pub fn decode<'a>(&self, input: Cow<'a, [u8]>) -> Cow<'a, str> { match self.encoding { + // `encoding.decode` never returns `Err` when called with `DecoderTrap::Replace` Some(encoding) => encoding.decode(&input, DecoderTrap::Replace).unwrap().into(), None => decode_utf8_lossy(input), } @@ -81,6 +82,7 @@ impl EncodingOverride { pub fn encode<'a>(&self, input: Cow<'a, str>) -> Cow<'a, [u8]> { match self.encoding { + // `encoding.encode` never returns `Err` when called with `EncoderTrap::NcrEscape` Some(encoding) => Cow::Owned(encoding.encode(&input, EncoderTrap::NcrEscape).unwrap()), None => encode_utf8(input) } @@ -112,10 +114,15 @@ pub fn decode_utf8_lossy(input: Cow<[u8]>) -> Cow { match input { Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes), Cow::Owned(bytes) => { + let raw_utf8: *const [u8]; match String::from_utf8_lossy(&bytes) { - Cow::Borrowed(_) => unsafe { String::from_utf8_unchecked(bytes) }.into(), - Cow::Owned(s) => s.into(), + Cow::Borrowed(utf8) => raw_utf8 = utf8.as_bytes(), + Cow::Owned(s) => return s.into(), } + // from_utf8_lossy returned a borrow of `bytes` unchanged. + debug_assert!(raw_utf8 == &*bytes as *const [u8]); + // Reuse the existing `Vec` allocation. + unsafe { String::from_utf8_unchecked(bytes) }.into() } } } diff --git a/src/lib.rs b/src/lib.rs index cd3b0653..89c63e97 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -503,9 +503,9 @@ impl Url { } /// Return the path for this URL, as a percent-encoded ASCII string. - /// For relative URLs, this starts with a '/' slash - /// and continues with slash-separated path segments. /// For cannot-be-a-base URLs, this is an arbitrary string that doesn’t start with '/'. 
+ /// For other URLs, this starts with a '/' slash + /// and continues with slash-separated path segments. pub fn path(&self) -> &str { match (self.query_start, self.fragment_start) { (None, None) => self.slice(self.path_start..), @@ -516,7 +516,8 @@ impl Url { } } - /// If this URL is relative, return an iterator of '/' slash-separated path segments, + /// Unless this URL is cannot-be-a-base, + /// return an iterator of '/' slash-separated path segments, /// each as a percent-encoded ASCII string. /// /// Return `None` for cannot-be-a-base URLs, or an iterator of at least one string. @@ -718,6 +719,7 @@ impl Url { /// Remove the last segment of this URL’s path. /// /// If this URL is cannot-be-a-base, do nothing and return `Err`. + /// If this URL is not cannot-be-a-base and its path is `/`, do nothing and return `Ok`. pub fn pop_path_segment(&mut self) -> Result<(), ()> { if self.cannot_be_a_base() { return Err(()) From 806bab012aece280d7b54f2c4a3645fe6cb73518 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 19 Apr 2016 18:26:39 +0200 Subject: [PATCH 83/89] form_urlencoded::Parse::into_owned returns a dedicated type This avoids a potential dynamic dispatch for the mapping function. --- src/form_urlencoded.rs | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index 2b5fb40c..190ba308 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -16,7 +16,6 @@ use encoding::EncodingOverride; use percent_encoding::{percent_encode_byte, percent_decode}; use std::borrow::{Borrow, Cow}; -use std::iter; use std::str; @@ -137,12 +136,22 @@ fn replace_plus<'a>(input: &'a [u8]) -> Cow<'a, [u8]> { } impl<'a> Parse<'a> { - /// Return a new iterator that yields pairs of `String` instead of pair of `Cow`. 
- pub fn into_owned(self) -> iter::Map, fn((Cow, Cow)) -> (String, String)> { - fn into_owned((k, v): (Cow, Cow)) -> (String, String) { - (k.into_owned(), v.into_owned()) - } - self.map(into_owned) + /// Return a new iterator that yields pairs of `String` instead of pairs of `Cow`. + pub fn into_owned(self) -> ParseIntoOwned<'a> { + ParseIntoOwned { inner: self } + } +} + +/// Like `Parse`, but yields pairs of `String` instead of pairs of `Cow`. +pub struct ParseIntoOwned<'a> { + inner: Parse<'a> +} + +impl<'a> Iterator for ParseIntoOwned<'a> { + type Item = (String, String); + + fn next(&mut self) -> Option { + self.inner.next().map(|(k, v)| (k.into_owned(), v.into_owned())) } } From 9602247880f6b99f0e596adf11d7dd7a38cfe2be Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 19 Apr 2016 18:27:14 +0200 Subject: [PATCH 84/89] Use an atomic counter rather than allocation to make opaque origin unique. --- src/origin.rs | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/src/origin.rs b/src/origin.rs index 37ec8b0f..49b8d87d 100644 --- a/src/origin.rs +++ b/src/origin.rs @@ -9,11 +9,14 @@ use host::Host; use idna::domain_to_unicode; use parser::default_port; -use std::sync::Arc; +use std::sync::atomic::{AtomicUsize, ATOMIC_USIZE_INIT, Ordering}; use Url; impl Url { /// Return the origin of this URL (https://url.spec.whatwg.org/#origin) + /// + /// Note: this return an opaque origin for `file:` URLs, which causes + /// `url.origin() != url.origin()`. pub fn origin(&self) -> Origin { let scheme = self.scheme(); match scheme { @@ -46,10 +49,12 @@ pub enum Origin { Tuple(String, Host, u16) } + impl Origin { /// Creates a new opaque origin that is only equal to itself. 
pub fn new_opaque() -> Origin { - Origin::Opaque(OpaqueOrigin(Arc::new(0))) + static COUNTER: AtomicUsize = ATOMIC_USIZE_INIT; + Origin::Opaque(OpaqueOrigin(COUNTER.fetch_add(1, Ordering::SeqCst))) } /// Return whether this origin is a (scheme, host, port) tuple @@ -95,17 +100,6 @@ impl Origin { } /// Opaque identifier for URLs that have file or other schemes -#[derive(Eq, Clone, Debug)] +#[derive(Eq, PartialEq, Clone, Debug)] #[cfg_attr(feature="heap_size", derive(HeapSizeOf))] -// `u8` is a dummy non-zero-sized type to force the allocator to return a unique pointer. -// (It returns `std::heap::EMPTY` for zero-sized allocations.) -pub struct OpaqueOrigin(Arc); - -/// Note that `opaque_origin.clone() != opaque_origin`. -impl PartialEq for OpaqueOrigin { - fn eq(&self, other: &Self) -> bool { - let a: *const u8 = &*self.0; - let b: *const u8 = &*other.0; - a == b - } -} +pub struct OpaqueOrigin(usize); From 3f92946af3d03a02e4bb6b8c04546a7d94c0e169 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 20 Apr 2016 13:43:26 +0200 Subject: [PATCH 85/89] Fewer magic numbers. --- src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 89c63e97..6d266dc9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -361,7 +361,7 @@ impl Url { #[inline] pub fn has_authority(&self) -> bool { debug_assert!(self.byte_at(self.scheme_end) == b':'); - self.slice(self.scheme_end + 1 ..).starts_with("//") + self.slice(self.scheme_end..).starts_with("://") } /// Return whether this URL is a cannot-be-a-base URL, @@ -378,7 +378,7 @@ impl Url { /// as a percent-encoded ASCII string. 
pub fn username(&self) -> &str { if self.has_authority() { - self.slice(self.scheme_end + 3..self.username_end) + self.slice(self.scheme_end + ("://".len() as u32)..self.username_end) } else { "" } From e4035794b09746a4da7fa8f2050e4740ebe004f5 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 20 Apr 2016 13:47:11 +0200 Subject: [PATCH 86/89] More detailed error type for Url::set_host --- src/lib.rs | 6 +++--- src/parser.rs | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 6d266dc9..277b996b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -825,13 +825,13 @@ impl Url { /// /// Removing the host (calling this with `None`) /// will also remove any username, password, and port number. - pub fn set_host(&mut self, host: Option<&str>) -> Result<(), ()> { + pub fn set_host(&mut self, host: Option<&str>) -> Result<(), ParseError> { if self.cannot_be_a_base() { - return Err(()) + return Err(ParseError::SetHostOnCannotBeABaseUrl) } if let Some(host) = host { - self.set_host_internal(try!(Host::parse(host).map_err(|_| ())), None) + self.set_host_internal(try!(Host::parse(host)), None) } else if self.has_host() { debug_assert!(self.byte_at(self.scheme_end) == b':'); debug_assert!(self.byte_at(self.path_start) == b'/'); diff --git a/src/parser.rs b/src/parser.rs index 50bf0107..04dd445f 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -52,6 +52,7 @@ simple_enum_error! { InvalidDomainCharacter => "invalid domain character", RelativeUrlWithoutBase => "relative URL without a base", RelativeUrlWithCannotBeABaseBase => "relative URL with a cannot-be-a-base base", + SetHostOnCannotBeABaseUrl => "a cannot-be-a-base URL doesn’t have a host to set", Overflow => "URLs more than 4 GB are not supported", } From f6996c8d33b8bc354ad609da8bbfc4a56e128559 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 20 Apr 2016 13:55:57 +0200 Subject: [PATCH 87/89] Have a single `impl Url` block with public methods. 
Otherwise they show up separately in rustdoc. --- src/lib.rs | 36 +++++++++++++++++++++--------------- src/origin.rs | 40 +++++++++++++++++----------------------- 2 files changed, 38 insertions(+), 38 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 277b996b..5960e4d4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -347,6 +347,15 @@ impl Url { } } + /// Return the origin of this URL (https://url.spec.whatwg.org/#origin) + /// + /// Note: this return an opaque origin for `file:` URLs, which causes + /// `url.origin() != url.origin()`. + #[inline] + pub fn origin(&self) -> Origin { + origin::url_origin(self) + } + /// Return the scheme of this URL, lower-cased, as an ASCII string without the ':' delimiter. #[inline] pub fn scheme(&self) -> &str { @@ -668,22 +677,7 @@ impl Url { let query = UrlQuery { url: self, fragment: fragment }; form_urlencoded::Serializer::for_suffix(query, query_start + "?".len()) } -} - - -/// Implementation detail of `Url::mutate_query_pairs`. Typically not used directly. -pub struct UrlQuery<'a> { - url: &'a mut Url, - fragment: Option, -} -impl<'a> Drop for UrlQuery<'a> { - fn drop(&mut self) { - self.url.restore_already_parsed_fragment(self.fragment.take()) - } -} - -impl Url { /// Change this URL’s path. pub fn set_path(&mut self, path: &str) { let (old_after_path_pos, after_path) = match (self.query_start, self.fragment_start) { @@ -1365,3 +1359,15 @@ fn file_url_segments_to_pathbuf_windows(mut segments: str::Split) -> Resul fn io_error(reason: &str) -> io::Result { Err(io::Error::new(io::ErrorKind::InvalidData, reason)) } + +/// Implementation detail of `Url::mutate_query_pairs`. Typically not used directly. 
+pub struct UrlQuery<'a> { + url: &'a mut Url, + fragment: Option, +} + +impl<'a> Drop for UrlQuery<'a> { + fn drop(&mut self) { + self.url.restore_already_parsed_fragment(self.fragment.take()) + } +} diff --git a/src/origin.rs b/src/origin.rs index 49b8d87d..da2039cf 100644 --- a/src/origin.rs +++ b/src/origin.rs @@ -12,29 +12,23 @@ use parser::default_port; use std::sync::atomic::{AtomicUsize, ATOMIC_USIZE_INIT, Ordering}; use Url; -impl Url { - /// Return the origin of this URL (https://url.spec.whatwg.org/#origin) - /// - /// Note: this return an opaque origin for `file:` URLs, which causes - /// `url.origin() != url.origin()`. - pub fn origin(&self) -> Origin { - let scheme = self.scheme(); - match scheme { - "blob" => { - let result = Url::parse(self.path()); - match result { - Ok(ref url) => url.origin(), - Err(_) => Origin::new_opaque() - } - }, - "ftp" | "gopher" | "http" | "https" | "ws" | "wss" => { - Origin::Tuple(scheme.to_owned(), self.host().unwrap().to_owned(), - self.port_or_known_default().unwrap()) - }, - // TODO: Figure out what to do if the scheme is a file - "file" => Origin::new_opaque(), - _ => Origin::new_opaque() - } +pub fn url_origin(url: &Url) -> Origin { + let scheme = url.scheme(); + match scheme { + "blob" => { + let result = Url::parse(url.path()); + match result { + Ok(ref url) => url_origin(url), + Err(_) => Origin::new_opaque() + } + }, + "ftp" | "gopher" | "http" | "https" | "ws" | "wss" => { + Origin::Tuple(scheme.to_owned(), url.host().unwrap().to_owned(), + url.port_or_known_default().unwrap()) + }, + // TODO: Figure out what to do if the scheme is a file + "file" => Origin::new_opaque(), + _ => Origin::new_opaque() } } From 18c0806136f255eb55d21ad39cc91f16822798b4 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 20 Apr 2016 22:59:22 +0200 Subject: [PATCH 88/89] 1.0.0 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 6af1dbe3..da2c3575 100644 --- a/Cargo.toml 
+++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "url" -version = "1.0.0-dev" +version = "1.0.0" authors = [ "Simon Sapin " ] description = "URL library for Rust, based on the WHATWG URL Standard" From 4a59d935fccd69366f46b64cd2928f894180b740 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Wed, 20 Apr 2016 23:17:00 +0200 Subject: [PATCH 89/89] =?UTF-8?q?Let=E2=80=99s=20not=20try=20to=20manually?= =?UTF-8?q?=20maintain=20a=20list=20of=20authors.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit See `git shortlog`. --- Cargo.toml | 2 +- LICENSE-MIT | 3 +-- idna/Cargo.toml | 2 +- idna/src/lib.rs | 2 +- idna/src/make_uts46_mapping_table.py | 4 ++-- idna/src/punycode.rs | 2 +- idna/src/uts46.rs | 2 +- idna/src/uts46_mapping_table.rs | 2 +- idna/tests/punycode.rs | 2 +- idna/tests/uts46.rs | 2 +- src/encoding.rs | 2 +- src/form_urlencoded.rs | 2 +- src/host.rs | 2 +- src/lib.rs | 2 +- src/origin.rs | 2 +- src/parser.rs | 2 +- src/percent_encoding.rs | 2 +- src/quirks.rs | 2 +- src/slicing.rs | 2 +- tests/data.rs | 2 +- tests/unit.rs | 2 +- 21 files changed, 22 insertions(+), 23 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index da2c3575..907a529f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ name = "url" version = "1.0.0" -authors = [ "Simon Sapin " ] +authors = ["The rust-url developers"] description = "URL library for Rust, based on the WHATWG URL Standard" documentation = "http://servo.github.io/rust-url/url/index.html" diff --git a/LICENSE-MIT b/LICENSE-MIT index 1da3a5f6..24de6b41 100644 --- a/LICENSE-MIT +++ b/LICENSE-MIT @@ -1,5 +1,4 @@ -Copyright (c) 2006-2009 Graydon Hoare -Copyright (c) 2009-2013 Mozilla Foundation +Copyright (c) 2013-2016 The rust-url developers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated diff --git a/idna/Cargo.toml b/idna/Cargo.toml index cc7a8d22..db532fa4 100644 --- a/idna/Cargo.toml +++ b/idna/Cargo.toml @@ -1,7 
+1,7 @@ [package] name = "idna" version = "0.1.0" -authors = ["Simon Sapin "] +authors = ["The rust-url developers"] description = "IDNA (Internationalizing Domain Names in Applications) and Punycode." repository = "https://github.com/servo/rust-url/" license = "MIT/Apache-2.0" diff --git a/idna/src/lib.rs b/idna/src/lib.rs index d53874f3..bdeafe44 100644 --- a/idna/src/lib.rs +++ b/idna/src/lib.rs @@ -1,4 +1,4 @@ -// Copyright 2016 Simon Sapin. +// Copyright 2016 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/idna/src/make_uts46_mapping_table.py b/idna/src/make_uts46_mapping_table.py index 8e090dc7..d4554e5b 100644 --- a/idna/src/make_uts46_mapping_table.py +++ b/idna/src/make_uts46_mapping_table.py @@ -1,4 +1,4 @@ -# Copyright 2013-2014 Valentin Gosu. +# Copyright 2013-2014 The rust-url developers. # # Licensed under the Apache License, Version 2.0 or the MIT license @@ -11,7 +11,7 @@ # http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt print('''\ -// Copyright 2013-2014 Valentin Gosu. +// Copyright 2013-2014 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/idna/src/punycode.rs b/idna/src/punycode.rs index 9e5f1769..75bb1d6e 100644 --- a/idna/src/punycode.rs +++ b/idna/src/punycode.rs @@ -1,4 +1,4 @@ -// Copyright 2013 Simon Sapin. +// Copyright 2013 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/idna/src/uts46.rs b/idna/src/uts46.rs index 1570ab33..bfe12ff2 100644 --- a/idna/src/uts46.rs +++ b/idna/src/uts46.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2014 Valentin Gosu. +// Copyright 2013-2014 The rust-url developers. 
// // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/idna/src/uts46_mapping_table.rs b/idna/src/uts46_mapping_table.rs index eb57dfb2..dd1bdaae 100644 --- a/idna/src/uts46_mapping_table.rs +++ b/idna/src/uts46_mapping_table.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2014 Valentin Gosu. +// Copyright 2013-2014 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/idna/tests/punycode.rs b/idna/tests/punycode.rs index b72c0aba..67988e80 100644 --- a/idna/tests/punycode.rs +++ b/idna/tests/punycode.rs @@ -1,4 +1,4 @@ -// Copyright 2013 Simon Sapin. +// Copyright 2013 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/idna/tests/uts46.rs b/idna/tests/uts46.rs index 038fdf45..f660559f 100644 --- a/idna/tests/uts46.rs +++ b/idna/tests/uts46.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2014 Valentin Gosu. +// Copyright 2013-2014 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/src/encoding.rs b/src/encoding.rs index ed88bb80..0703c788 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2014 Simon Sapin. +// Copyright 2013-2014 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index 190ba308..16fbeb51 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2016 Simon Sapin. +// Copyright 2013-2016 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/src/host.rs b/src/host.rs index 093cb5ba..47ebbd79 100644 --- a/src/host.rs +++ b/src/host.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2016 Simon Sapin. +// Copyright 2013-2016 The rust-url developers. 
// // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/src/lib.rs b/src/lib.rs index 5960e4d4..5feba5b0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2015 Simon Sapin. +// Copyright 2013-2015 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/src/origin.rs b/src/origin.rs index da2039cf..a78b939f 100644 --- a/src/origin.rs +++ b/src/origin.rs @@ -1,4 +1,4 @@ -// Copyright 2016 Simon Sapin. +// Copyright 2016 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/src/parser.rs b/src/parser.rs index 04dd445f..39879de5 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2016 Simon Sapin. +// Copyright 2013-2016 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs index f91402d4..72b47245 100644 --- a/src/percent_encoding.rs +++ b/src/percent_encoding.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2016 Simon Sapin. +// Copyright 2013-2016 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/src/quirks.rs b/src/quirks.rs index 55bef503..3e25ac20 100644 --- a/src/quirks.rs +++ b/src/quirks.rs @@ -1,4 +1,4 @@ -// Copyright 2016 Simon Sapin. +// Copyright 2016 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/src/slicing.rs b/src/slicing.rs index 5a303e4b..926f3c79 100644 --- a/src/slicing.rs +++ b/src/slicing.rs @@ -1,4 +1,4 @@ -// Copyright 2016 Simon Sapin. +// Copyright 2016 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/tests/data.rs b/tests/data.rs index 61c893f1..b8945aa4 100644 --- a/tests/data.rs +++ b/tests/data.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2014 Simon Sapin. 
+// Copyright 2013-2014 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/tests/unit.rs b/tests/unit.rs index 44dbd64e..6038e1f9 100644 --- a/tests/unit.rs +++ b/tests/unit.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2014 Simon Sapin. +// Copyright 2013-2014 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license