diff --git a/.gitignore b/.gitignore index 7cbe84a5..0284c25c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ -/target -/Cargo.lock +target +Cargo.lock /.cargo/config diff --git a/Cargo.toml b/Cargo.toml index b2458c0a..907a529f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,8 +1,8 @@ [package] name = "url" -version = "0.5.9" -authors = [ "Simon Sapin " ] +version = "1.0.0" +authors = ["The rust-url developers"] description = "URL library for Rust, based on the WHATWG URL Standard" documentation = "http://servo.github.io/rust-url/url/index.html" @@ -12,46 +12,28 @@ keywords = ["url", "parser"] license = "MIT/Apache-2.0" [[test]] -name = "format" -[[test]] -name = "form_urlencoded" -[[test]] -name = "idna" -[[test]] -name = "punycode" -[[test]] -name = "tests" +name = "unit" + [[test]] -name = "wpt" +name = "data" harness = false +[lib] +test = false + [dev-dependencies] rustc-test = "0.1" +rustc-serialize = "0.3" [features] query_encoding = ["encoding"] -serde_serialization = ["serde"] heap_size = ["heapsize", "heapsize_plugin"] -[dependencies.heapsize] -version = ">=0.1.1, <0.4" -optional = true - -[dependencies.heapsize_plugin] -version = "0.1.0" -optional = true - -[dependencies.encoding] -version = "0.2" -optional = true - -[dependencies.serde] -version = ">=0.6.1, <0.8" -optional = true - [dependencies] -uuid = { version = "0.2", features = ["v4"] } -rustc-serialize = "0.3" -unicode-bidi = "0.2.3" -unicode-normalization = "0.1.2" +idna = { version = "0.1.0", path = "./idna" } +heapsize = {version = ">=0.1.1, <0.4", optional = true} +heapsize_plugin = {version = "0.1.0", optional = true} +encoding = {version = "0.2", optional = true} +serde = {version = ">=0.6.1, <0.8", optional = true} +rustc-serialize = {version = "0.3", optional = true} matches = "0.1" diff --git a/LICENSE-MIT b/LICENSE-MIT index 1da3a5f6..24de6b41 100644 --- a/LICENSE-MIT +++ b/LICENSE-MIT @@ -1,5 +1,4 @@ -Copyright (c) 2006-2009 Graydon Hoare -Copyright (c) 2009-2013 Mozilla 
Foundation +Copyright (c) 2013-2016 The rust-url developers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated diff --git a/Makefile b/Makefile index e46603be..f76adfe1 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,5 @@ test: - cargo test --features query_encoding - cargo test --features serde_serialization - cargo test + cargo test --features "query_encoding serde rustc-serialize" [ x$$TRAVIS_RUST_VERSION != xnightly ] || cargo test --features heap_size doc: diff --git a/idna/Cargo.toml b/idna/Cargo.toml new file mode 100644 index 00000000..db532fa4 --- /dev/null +++ b/idna/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "idna" +version = "0.1.0" +authors = ["The rust-url developers"] +description = "IDNA (Internationalizing Domain Names in Applications) and Punycode." +repository = "https://github.com/servo/rust-url/" +license = "MIT/Apache-2.0" + +[lib] +doctest = false +test = false + +[[test]] +name = "tests" +harness = false + +[dev-dependencies] +rustc-test = "0.1" +rustc-serialize = "0.3" + +[dependencies] +unicode-bidi = "0.2.3" +unicode-normalization = "0.1.2" +matches = "0.1" diff --git a/IdnaMappingTable.txt b/idna/src/IdnaMappingTable.txt similarity index 100% rename from IdnaMappingTable.txt rename to idna/src/IdnaMappingTable.txt diff --git a/idna/src/lib.rs b/idna/src/lib.rs new file mode 100644 index 00000000..bdeafe44 --- /dev/null +++ b/idna/src/lib.rs @@ -0,0 +1,73 @@ +// Copyright 2016 The rust-url developers. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! This Rust crate implements IDNA +//! [per the WHATWG URL Standard](https://url.spec.whatwg.org/#idna). +//! +//! It also exposes the underlying algorithms from [*Unicode IDNA Compatibility Processing* +//! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/) +//! 
and [Punycode (RFC 3492)](https://tools.ietf.org/html/rfc3492). +//! +//! Quoting from [UTS #46’s introduction](http://www.unicode.org/reports/tr46/#Introduction): +//! +//! > Initially, domain names were restricted to ASCII characters. +//! > A system was introduced in 2003 for internationalized domain names (IDN). +//! > This system is called Internationalizing Domain Names for Applications, +//! > or IDNA2003 for short. +//! > This mechanism supports IDNs by means of a client software transformation +//! > into a format known as Punycode. +//! > A revision of IDNA was approved in 2010 (IDNA2008). +//! > This revision has a number of incompatibilities with IDNA2003. +//! > +//! > The incompatibilities force implementers of client software, +//! > such as browsers and emailers, +//! > to face difficult choices during the transition period +//! > as registries shift from IDNA2003 to IDNA2008. +//! > This document specifies a mechanism +//! > that minimizes the impact of this transition for client software, +//! > allowing client software to access domains that are valid under either system. + +#[macro_use] extern crate matches; +extern crate unicode_bidi; +extern crate unicode_normalization; + +pub mod punycode; +pub mod uts46; + +/// The [domain to ASCII](https://url.spec.whatwg.org/#concept-domain-to-ascii) algorithm. +/// +/// Return the ASCII representation a domain name, +/// normalizing characters (upper-case to lower-case and other kinds of equivalence) +/// and using Punycode as necessary. +/// +/// This process may fail. +pub fn domain_to_ascii(domain: &str) -> Result { + uts46::to_ascii(domain, uts46::Flags { + use_std3_ascii_rules: false, + transitional_processing: true, // XXX: switch when Firefox does + verify_dns_length: false, + }) +} + +/// The [domain to Unicode](https://url.spec.whatwg.org/#concept-domain-to-unicode) algorithm. 
+/// +/// Return the Unicode representation of a domain name, +/// normalizing characters (upper-case to lower-case and other kinds of equivalence) +/// and decoding Punycode as necessary. +/// +/// This may indicate [syntax violations](https://url.spec.whatwg.org/#syntax-violation) +/// but always returns a string for the mapped domain. +pub fn domain_to_unicode(domain: &str) -> (String, Result<(), uts46::Errors>) { + uts46::to_unicode(domain, uts46::Flags { + use_std3_ascii_rules: false, + + // Unused: + transitional_processing: true, + verify_dns_length: false, + }) +} diff --git a/make_idna_table.py b/idna/src/make_uts46_mapping_table.py similarity index 90% rename from make_idna_table.py rename to idna/src/make_uts46_mapping_table.py index 5700d680..d4554e5b 100644 --- a/make_idna_table.py +++ b/idna/src/make_uts46_mapping_table.py @@ -1,4 +1,4 @@ -# Copyright 2013-2014 Valentin Gosu. +# Copyright 2013-2014 The rust-url developers. # # Licensed under the Apache License, Version 2.0 or the MIT license @@ -6,13 +6,12 @@ # option. This file may not be copied, modified, or distributed # except according to those terms. - -# Run as: python make_idna_table.py idna_table.txt > src/idna_table.rs +# Run as: python make_uts46_mapping_table.py IdnaMappingTable.txt > uts46_mapping_table.rs # You can get the latest idna table from # http://www.unicode.org/Public/idna/latest/IdnaMappingTable.txt print('''\ -// Copyright 2013-2014 Valentin Gosu. +// Copyright 2013-2014 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/src/punycode.rs b/idna/src/punycode.rs similarity index 94% rename from src/punycode.rs rename to idna/src/punycode.rs index 27525faf..75bb1d6e 100644 --- a/src/punycode.rs +++ b/idna/src/punycode.rs @@ -1,4 +1,4 @@ -// Copyright 2013 Simon Sapin. +// Copyright 2013 The rust-url developers. 
// // Licensed under the Apache License, Version 2.0 or the MIT license @@ -185,11 +185,11 @@ pub fn encode(input: &[char]) -> Option { break } let value = t + ((q - t) % (BASE - t)); - value_to_digit(value, &mut output); + output.push(value_to_digit(value)); q = (q - t) / (BASE - t); k += BASE; } - value_to_digit(q, &mut output); + output.push(value_to_digit(q)); bias = adapt(delta, processed + 1, processed == basic_length); delta = 0; processed += 1; @@ -203,11 +203,10 @@ pub fn encode(input: &[char]) -> Option { #[inline] -fn value_to_digit(value: u32, output: &mut String) { - let code_point = match value { - 0 ... 25 => value + 0x61, // a..z - 26 ... 35 => value - 26 + 0x30, // 0..9 +fn value_to_digit(value: u32) -> char { + match value { + 0 ... 25 => (value as u8 + 'a' as u8) as char, // a..z + 26 ... 35 => (value as u8 - 26 + '0' as u8) as char, // 0..9 _ => panic!() - }; - unsafe { output.as_mut_vec().push(code_point as u8) } + } } diff --git a/src/idna.rs b/idna/src/uts46.rs similarity index 84% rename from src/idna.rs rename to idna/src/uts46.rs index e0efdb39..bfe12ff2 100644 --- a/src/idna.rs +++ b/idna/src/uts46.rs @@ -1,6 +1,13 @@ -//! International domain names -//! -//! https://url.spec.whatwg.org/#idna +// Copyright 2013-2014 The rust-url developers. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! [*Unicode IDNA Compatibility Processing* +//! 
(Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/) use self::Mapping::*; use punycode; @@ -9,7 +16,7 @@ use unicode_normalization::UnicodeNormalization; use unicode_normalization::char::is_combining_mark; use unicode_bidi::{BidiClass, bidi_class}; -include!("idna_mapping.rs"); +include!("uts46_mapping_table.rs"); #[derive(Debug)] enum Mapping { @@ -23,9 +30,9 @@ enum Mapping { } struct Range { - pub from: char, - pub to: char, - pub mapping: Mapping, + from: char, + to: char, + mapping: Mapping, } fn find_char(codepoint: char) -> &'static Mapping { @@ -45,7 +52,7 @@ fn find_char(codepoint: char) -> &'static Mapping { &TABLE[min].mapping } -fn map_char(codepoint: char, flags: Uts46Flags, output: &mut String, errors: &mut Vec) { +fn map_char(codepoint: char, flags: Flags, output: &mut String, errors: &mut Vec) { match *find_char(codepoint) { Mapping::Valid => output.push(codepoint), Mapping::Ignored => {}, @@ -185,7 +192,7 @@ fn passes_bidi(label: &str, transitional_processing: bool) -> bool { } /// http://www.unicode.org/reports/tr46/#Validity_Criteria -fn validate(label: &str, flags: Uts46Flags, errors: &mut Vec) { +fn validate(label: &str, flags: Flags, errors: &mut Vec) { if label.nfc().ne(label.chars()) { errors.push(Error::ValidityCriteria); } @@ -212,7 +219,7 @@ fn validate(label: &str, flags: Uts46Flags, errors: &mut Vec) { } /// http://www.unicode.org/reports/tr46/#Processing -fn uts46_processing(domain: &str, flags: Uts46Flags, errors: &mut Vec) -> String { +fn processing(domain: &str, flags: Flags, errors: &mut Vec) -> String { let mut mapped = String::new(); for c in domain.chars() { map_char(c, flags, &mut mapped, errors) @@ -226,7 +233,7 @@ fn uts46_processing(domain: &str, flags: Uts46Flags, errors: &mut Vec) -> if label.starts_with("xn--") { match punycode::decode_to_string(&label["xn--".len()..]) { Some(decoded_label) => { - let flags = Uts46Flags { transitional_processing: false, ..flags }; + let flags = Flags { 
transitional_processing: false, ..flags }; validate(&decoded_label, flags, errors); validated.push_str(&decoded_label) } @@ -241,14 +248,14 @@ fn uts46_processing(domain: &str, flags: Uts46Flags, errors: &mut Vec) -> } #[derive(Copy, Clone)] -pub struct Uts46Flags { +pub struct Flags { pub use_std3_ascii_rules: bool, pub transitional_processing: bool, pub verify_dns_length: bool, } #[derive(PartialEq, Eq, Clone, Copy, Debug)] -pub enum Error { +enum Error { PunycodeError, ValidityCriteria, DissallowedByStd3AsciiRules, @@ -257,11 +264,18 @@ pub enum Error { TooLongForDns, } +/// Errors recorded during UTS #46 processing. +/// +/// This is opaque for now, only indicating the presence of at least one error. +/// More details may be exposed in the future. +#[derive(Debug)] +pub struct Errors(Vec); + /// http://www.unicode.org/reports/tr46/#ToASCII -pub fn uts46_to_ascii(domain: &str, flags: Uts46Flags) -> Result> { +pub fn to_ascii(domain: &str, flags: Flags) -> Result { let mut errors = Vec::new(); let mut result = String::new(); - for label in uts46_processing(domain, flags, &mut errors).split('.') { + for label in processing(domain, flags, &mut errors).split('.') { if result.len() > 0 { result.push('.'); } @@ -288,36 +302,21 @@ pub fn uts46_to_ascii(domain: &str, flags: Uts46Flags) -> Result Result> { - uts46_to_ascii(domain, Uts46Flags { - use_std3_ascii_rules: false, - transitional_processing: true, // XXX: switch when Firefox does - verify_dns_length: false, - }) -} - /// http://www.unicode.org/reports/tr46/#ToUnicode /// /// Only `use_std3_ascii_rules` is used in `flags`. 
-pub fn uts46_to_unicode(domain: &str, mut flags: Uts46Flags) -> (String, Vec) { +pub fn to_unicode(domain: &str, mut flags: Flags) -> (String, Result<(), Errors>) { flags.transitional_processing = false; let mut errors = Vec::new(); - let domain = uts46_processing(domain, flags, &mut errors); + let domain = processing(domain, flags, &mut errors); + let errors = if errors.is_empty() { + Ok(()) + } else { + Err(Errors(errors)) + }; (domain, errors) } - -/// https://url.spec.whatwg.org/#concept-domain-to-unicode -pub fn domain_to_unicode(domain: &str) -> (String, Vec) { - uts46_to_unicode(domain, Uts46Flags { - use_std3_ascii_rules: false, - - // Unused: - transitional_processing: true, - verify_dns_length: false, - }) -} diff --git a/src/idna_mapping.rs b/idna/src/uts46_mapping_table.rs similarity index 99% rename from src/idna_mapping.rs rename to idna/src/uts46_mapping_table.rs index eb57dfb2..dd1bdaae 100644 --- a/src/idna_mapping.rs +++ b/idna/src/uts46_mapping_table.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2014 Valentin Gosu. +// Copyright 2013-2014 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license diff --git a/tests/IdnaTest.txt b/idna/tests/IdnaTest.txt similarity index 100% rename from tests/IdnaTest.txt rename to idna/tests/IdnaTest.txt diff --git a/idna/tests/punycode.rs b/idna/tests/punycode.rs new file mode 100644 index 00000000..67988e80 --- /dev/null +++ b/idna/tests/punycode.rs @@ -0,0 +1,65 @@ +// Copyright 2013 The rust-url developers. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +use idna::punycode::{decode, encode_str}; +use rustc_serialize::json::{Json, Object}; +use test::TestFn; + +fn one_test(decoded: &str, encoded: &str) { + match decode(encoded) { + None => panic!("Decoding {} failed.", encoded), + Some(result) => { + let result = result.into_iter().collect::(); + assert!(result == decoded, + format!("Incorrect decoding of \"{}\":\n \"{}\"\n!= \"{}\"\n", + encoded, result, decoded)) + } + } + + match encode_str(decoded) { + None => panic!("Encoding {} failed.", decoded), + Some(result) => { + assert!(result == encoded, + format!("Incorrect encoding of \"{}\":\n \"{}\"\n!= \"{}\"\n", + decoded, result, encoded)) + } + } +} + +fn get_string<'a>(map: &'a Object, key: &str) -> &'a str { + match map.get(&key.to_string()) { + Some(&Json::String(ref s)) => s, + None => "", + _ => panic!(), + } +} + +pub fn collect_tests(add_test: &mut F) { + match Json::from_str(include_str!("punycode_tests.json")) { + Ok(Json::Array(tests)) => for (i, test) in tests.into_iter().enumerate() { + match test { + Json::Object(o) => { + let test_name = { + let desc = get_string(&o, "description"); + if desc.is_empty() { + format!("Punycode {}", i + 1) + } else { + format!("Punycode {}: {}", i + 1, desc) + } + }; + add_test(test_name, TestFn::dyn_test_fn(move || one_test( + get_string(&o, "decoded"), + get_string(&o, "encoded"), + ))) + } + _ => panic!(), + } + }, + other => panic!("{:?}", other) + } +} diff --git a/tests/punycode_tests.json b/idna/tests/punycode_tests.json similarity index 100% rename from tests/punycode_tests.json rename to idna/tests/punycode_tests.json diff --git a/idna/tests/tests.rs b/idna/tests/tests.rs new file mode 100644 index 00000000..0a4ad03e --- /dev/null +++ b/idna/tests/tests.rs @@ -0,0 +1,25 @@ +extern crate idna; +extern crate rustc_serialize; +extern crate test; + +mod punycode; +mod uts46; + +fn main() { + let mut tests = Vec::new(); + { + let mut add_test = |name, run| { + tests.push(test::TestDescAndFn { + desc: 
test::TestDesc { + name: test::DynTestName(name), + ignore: false, + should_panic: test::ShouldPanic::No, + }, + testfn: run, + }) + }; + punycode::collect_tests(&mut add_test); + uts46::collect_tests(&mut add_test); + } + test::test_main(&std::env::args().collect::>(), tests) +} diff --git a/idna/tests/uts46.rs b/idna/tests/uts46.rs new file mode 100644 index 00000000..f660559f --- /dev/null +++ b/idna/tests/uts46.rs @@ -0,0 +1,117 @@ +// Copyright 2013-2014 The rust-url developers. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::char; +use idna::uts46; +use test::TestFn; + +pub fn collect_tests(add_test: &mut F) { + // http://www.unicode.org/Public/idna/latest/IdnaTest.txt + for (i, line) in include_str!("IdnaTest.txt").lines().enumerate() { + if line == "" || line.starts_with("#") { + continue + } + // Remove comments + let mut line = match line.find("#") { + Some(index) => &line[0..index], + None => line + }; + + let mut expected_failure = false; + if line.starts_with("XFAIL") { + expected_failure = true; + line = &line[5..line.len()]; + }; + + let mut pieces = line.split(';').map(|x| x.trim()).collect::>(); + + let test_type = pieces.remove(0); + let original = pieces.remove(0); + let source = unescape(original); + let to_unicode = pieces.remove(0); + let to_ascii = pieces.remove(0); + let nv8 = if pieces.len() > 0 { pieces.remove(0) } else { "" }; + + if expected_failure { + continue; + } + + let test_name = format!("UTS #46 line {}", i + 1); + add_test(test_name, TestFn::dyn_test_fn(move || { + let result = uts46::to_ascii(&source, uts46::Flags { + use_std3_ascii_rules: true, + transitional_processing: test_type == "T", + verify_dns_length: true, + }); + + if to_ascii.starts_with("[") { + if to_ascii.starts_with("[C") { + // http://unicode.org/reports/tr46/#Deviations + // applications that perform 
IDNA2008 lookup are not required to check + // for these contexts + return; + } + let res = result.ok(); + assert!(res == None, "Expected error. result: {} | original: {} | source: {}", + res.unwrap(), original, source); + return; + } + + let to_ascii = if to_ascii.len() > 0 { + to_ascii.to_string() + } else { + if to_unicode.len() > 0 { + to_unicode.to_string() + } else { + source.clone() + } + }; + + if nv8 == "NV8" { + // This result isn't valid under IDNA2008. Skip it + return; + } + + assert!(result.is_ok(), "Couldn't parse {} | original: {} | error: {:?}", + source, original, result.err()); + let output = result.ok().unwrap(); + assert!(output == to_ascii, "result: {} | expected: {} | original: {} | source: {}", + output, to_ascii, original, source); + })) + } +} + +fn unescape(input: &str) -> String { + let mut output = String::new(); + let mut chars = input.chars(); + loop { + match chars.next() { + None => return output, + Some(c) => + if c == '\\' { + match chars.next().unwrap() { + '\\' => output.push('\\'), + 'u' => { + let c1 = chars.next().unwrap().to_digit(16).unwrap(); + let c2 = chars.next().unwrap().to_digit(16).unwrap(); + let c3 = chars.next().unwrap().to_digit(16).unwrap(); + let c4 = chars.next().unwrap().to_digit(16).unwrap(); + match char::from_u32((((c1 * 16 + c2) * 16 + c3) * 16 + c4)) + { + Some(c) => output.push(c), + None => { output.push_str(&format!("\\u{:X}{:X}{:X}{:X}",c1,c2,c3,c4)); } + }; + } + _ => panic!("Invalid test data input"), + } + } else { + output.push(c); + } + } + } +} diff --git a/make_encode_sets.py b/make_encode_sets.py deleted file mode 100644 index eb859050..00000000 --- a/make_encode_sets.py +++ /dev/null @@ -1,42 +0,0 @@ -# Copyright 2013-2014 Simon Sapin. -# -# Licensed under the Apache License, Version 2.0 or the MIT license -# , at your -# option. This file may not be copied, modified, or distributed -# except according to those terms. 
- - -# Run as: python make_encode_sets.py > src/encode_sets.rs - - -print('''\ -// Copyright 2013-2014 Simon Sapin. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -// Generated by make_encode_sets.py -''') -for name, encoded in [ - ('SIMPLE', ''), - ('QUERY', r''' "#<>'''), - ('DEFAULT', r''' "#<>`?{}'''), - ('USERINFO', r''' "#<>`?{}@'''), - ('PASSWORD', r''' "#<>`?{}@\/'''), - ('USERNAME', r''' "#<>`?{}@\/:'''), - ('FORM_URLENCODED', r''' !"#$%&\'()+,/:;<=>?@[\]^`{|}~'''), - ('HTTP_VALUE', r''' "%'()*,/:;<->?[\]{}'''), -]: - print( - "pub static %s: [&'static str; 256] = [\n%s\n];\n\n" - % (name, '\n'.join( - ' ' + ' '.join( - '"%s%s",' % ("\\" if chr(b) in '\\"' else "", chr(b)) - if 0x20 <= b <= 0x7E and chr(b) not in encoded - else '"%%%02X",' % b - for b in range(s, s + 8) - ) for s in range(0, 256, 8)))) diff --git a/src/encode_sets.rs b/src/encode_sets.rs deleted file mode 100644 index d7b5fb9d..00000000 --- a/src/encode_sets.rs +++ /dev/null @@ -1,298 +0,0 @@ -// Copyright 2013-2014 Simon Sapin. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. 
- -// Generated by make_encode_sets.py - -pub static SIMPLE: [&'static str; 256] = [ - "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", - "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", - "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", - "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F", - " ", "!", "\"", "#", "$", "%", "&", "'", - "(", ")", "*", "+", ",", "-", ".", "/", - "0", "1", "2", "3", "4", "5", "6", "7", - "8", "9", ":", ";", "<", "=", ">", "?", - "@", "A", "B", "C", "D", "E", "F", "G", - "H", "I", "J", "K", "L", "M", "N", "O", - "P", "Q", "R", "S", "T", "U", "V", "W", - "X", "Y", "Z", "[", "\\", "]", "^", "_", - "`", "a", "b", "c", "d", "e", "f", "g", - "h", "i", "j", "k", "l", "m", "n", "o", - "p", "q", "r", "s", "t", "u", "v", "w", - "x", "y", "z", "{", "|", "}", "~", "%7F", - "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", - "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", - "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97", - "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F", - "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", - "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", - "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7", - "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF", - "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", - "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", - "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", - "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF", - "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", - "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", - "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", - "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF", -]; - - -pub static QUERY: [&'static str; 256] = [ - "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", - "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", - "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", - "%18", "%19", 
"%1A", "%1B", "%1C", "%1D", "%1E", "%1F", - "%20", "!", "%22", "%23", "$", "%", "&", "'", - "(", ")", "*", "+", ",", "-", ".", "/", - "0", "1", "2", "3", "4", "5", "6", "7", - "8", "9", ":", ";", "%3C", "=", "%3E", "?", - "@", "A", "B", "C", "D", "E", "F", "G", - "H", "I", "J", "K", "L", "M", "N", "O", - "P", "Q", "R", "S", "T", "U", "V", "W", - "X", "Y", "Z", "[", "\\", "]", "^", "_", - "`", "a", "b", "c", "d", "e", "f", "g", - "h", "i", "j", "k", "l", "m", "n", "o", - "p", "q", "r", "s", "t", "u", "v", "w", - "x", "y", "z", "{", "|", "}", "~", "%7F", - "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", - "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", - "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97", - "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F", - "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", - "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", - "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7", - "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF", - "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", - "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", - "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", - "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF", - "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", - "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", - "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", - "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF", -]; - - -pub static DEFAULT: [&'static str; 256] = [ - "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", - "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", - "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", - "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F", - "%20", "!", "%22", "%23", "$", "%", "&", "'", - "(", ")", "*", "+", ",", "-", ".", "/", - "0", "1", "2", "3", "4", "5", "6", "7", - "8", "9", ":", ";", "%3C", "=", "%3E", "%3F", - "@", "A", "B", "C", "D", "E", "F", "G", - 
"H", "I", "J", "K", "L", "M", "N", "O", - "P", "Q", "R", "S", "T", "U", "V", "W", - "X", "Y", "Z", "[", "\\", "]", "^", "_", - "%60", "a", "b", "c", "d", "e", "f", "g", - "h", "i", "j", "k", "l", "m", "n", "o", - "p", "q", "r", "s", "t", "u", "v", "w", - "x", "y", "z", "%7B", "|", "%7D", "~", "%7F", - "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", - "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", - "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97", - "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F", - "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", - "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", - "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7", - "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF", - "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", - "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", - "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", - "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF", - "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", - "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", - "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", - "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF", -]; - - -pub static USERINFO: [&'static str; 256] = [ - "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", - "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", - "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", - "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F", - "%20", "!", "%22", "%23", "$", "%", "&", "'", - "(", ")", "*", "+", ",", "-", ".", "/", - "0", "1", "2", "3", "4", "5", "6", "7", - "8", "9", ":", ";", "%3C", "=", "%3E", "%3F", - "%40", "A", "B", "C", "D", "E", "F", "G", - "H", "I", "J", "K", "L", "M", "N", "O", - "P", "Q", "R", "S", "T", "U", "V", "W", - "X", "Y", "Z", "[", "\\", "]", "^", "_", - "%60", "a", "b", "c", "d", "e", "f", "g", - "h", "i", "j", "k", "l", "m", "n", "o", - "p", "q", "r", "s", "t", "u", "v", "w", - 
"x", "y", "z", "%7B", "|", "%7D", "~", "%7F", - "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", - "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", - "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97", - "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F", - "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", - "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", - "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7", - "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF", - "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", - "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", - "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", - "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF", - "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", - "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", - "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", - "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF", -]; - - -pub static PASSWORD: [&'static str; 256] = [ - "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", - "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", - "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", - "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F", - "%20", "!", "%22", "%23", "$", "%", "&", "'", - "(", ")", "*", "+", ",", "-", ".", "%2F", - "0", "1", "2", "3", "4", "5", "6", "7", - "8", "9", ":", ";", "%3C", "=", "%3E", "%3F", - "%40", "A", "B", "C", "D", "E", "F", "G", - "H", "I", "J", "K", "L", "M", "N", "O", - "P", "Q", "R", "S", "T", "U", "V", "W", - "X", "Y", "Z", "[", "%5C", "]", "^", "_", - "%60", "a", "b", "c", "d", "e", "f", "g", - "h", "i", "j", "k", "l", "m", "n", "o", - "p", "q", "r", "s", "t", "u", "v", "w", - "x", "y", "z", "%7B", "|", "%7D", "~", "%7F", - "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", - "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", - "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97", - "%98", "%99", "%9A", "%9B", 
"%9C", "%9D", "%9E", "%9F", - "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", - "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", - "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7", - "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF", - "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", - "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", - "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", - "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF", - "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", - "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", - "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", - "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF", -]; - - -pub static USERNAME: [&'static str; 256] = [ - "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", - "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", - "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", - "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F", - "%20", "!", "%22", "%23", "$", "%", "&", "'", - "(", ")", "*", "+", ",", "-", ".", "%2F", - "0", "1", "2", "3", "4", "5", "6", "7", - "8", "9", "%3A", ";", "%3C", "=", "%3E", "%3F", - "%40", "A", "B", "C", "D", "E", "F", "G", - "H", "I", "J", "K", "L", "M", "N", "O", - "P", "Q", "R", "S", "T", "U", "V", "W", - "X", "Y", "Z", "[", "%5C", "]", "^", "_", - "%60", "a", "b", "c", "d", "e", "f", "g", - "h", "i", "j", "k", "l", "m", "n", "o", - "p", "q", "r", "s", "t", "u", "v", "w", - "x", "y", "z", "%7B", "|", "%7D", "~", "%7F", - "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", - "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", - "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97", - "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F", - "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", - "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", - "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7", - "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", 
"%BF", - "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", - "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", - "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", - "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF", - "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", - "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", - "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", - "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF", -]; - - -pub static FORM_URLENCODED: [&'static str; 256] = [ - "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", - "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", - "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", - "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F", - "%20", "%21", "%22", "%23", "%24", "%25", "%26", "%27", - "%28", "%29", "*", "%2B", "%2C", "-", ".", "%2F", - "0", "1", "2", "3", "4", "5", "6", "7", - "8", "9", "%3A", "%3B", "%3C", "%3D", "%3E", "%3F", - "%40", "A", "B", "C", "D", "E", "F", "G", - "H", "I", "J", "K", "L", "M", "N", "O", - "P", "Q", "R", "S", "T", "U", "V", "W", - "X", "Y", "Z", "%5B", "%5C", "%5D", "%5E", "_", - "%60", "a", "b", "c", "d", "e", "f", "g", - "h", "i", "j", "k", "l", "m", "n", "o", - "p", "q", "r", "s", "t", "u", "v", "w", - "x", "y", "z", "%7B", "%7C", "%7D", "%7E", "%7F", - "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", - "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", - "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97", - "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F", - "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", - "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", - "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7", - "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF", - "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", - "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", - "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", - "%D8", "%D9", "%DA", "%DB", 
"%DC", "%DD", "%DE", "%DF", - "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", - "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", - "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", - "%F8", "%F9", "%FA", "%FB", "%FC", "%FD", "%FE", "%FF", -]; - - -pub static HTTP_VALUE: [&'static str; 256] = [ - "%00", "%01", "%02", "%03", "%04", "%05", "%06", "%07", - "%08", "%09", "%0A", "%0B", "%0C", "%0D", "%0E", "%0F", - "%10", "%11", "%12", "%13", "%14", "%15", "%16", "%17", - "%18", "%19", "%1A", "%1B", "%1C", "%1D", "%1E", "%1F", - "%20", "!", "%22", "#", "$", "%25", "&", "%27", - "%28", "%29", "%2A", "+", "%2C", "%2D", ".", "%2F", - "0", "1", "2", "3", "4", "5", "6", "7", - "8", "9", "%3A", "%3B", "%3C", "=", "%3E", "%3F", - "@", "A", "B", "C", "D", "E", "F", "G", - "H", "I", "J", "K", "L", "M", "N", "O", - "P", "Q", "R", "S", "T", "U", "V", "W", - "X", "Y", "Z", "%5B", "%5C", "%5D", "^", "_", - "`", "a", "b", "c", "d", "e", "f", "g", - "h", "i", "j", "k", "l", "m", "n", "o", - "p", "q", "r", "s", "t", "u", "v", "w", - "x", "y", "z", "%7B", "|", "%7D", "~", "%7F", - "%80", "%81", "%82", "%83", "%84", "%85", "%86", "%87", - "%88", "%89", "%8A", "%8B", "%8C", "%8D", "%8E", "%8F", - "%90", "%91", "%92", "%93", "%94", "%95", "%96", "%97", - "%98", "%99", "%9A", "%9B", "%9C", "%9D", "%9E", "%9F", - "%A0", "%A1", "%A2", "%A3", "%A4", "%A5", "%A6", "%A7", - "%A8", "%A9", "%AA", "%AB", "%AC", "%AD", "%AE", "%AF", - "%B0", "%B1", "%B2", "%B3", "%B4", "%B5", "%B6", "%B7", - "%B8", "%B9", "%BA", "%BB", "%BC", "%BD", "%BE", "%BF", - "%C0", "%C1", "%C2", "%C3", "%C4", "%C5", "%C6", "%C7", - "%C8", "%C9", "%CA", "%CB", "%CC", "%CD", "%CE", "%CF", - "%D0", "%D1", "%D2", "%D3", "%D4", "%D5", "%D6", "%D7", - "%D8", "%D9", "%DA", "%DB", "%DC", "%DD", "%DE", "%DF", - "%E0", "%E1", "%E2", "%E3", "%E4", "%E5", "%E6", "%E7", - "%E8", "%E9", "%EA", "%EB", "%EC", "%ED", "%EE", "%EF", - "%F0", "%F1", "%F2", "%F3", "%F4", "%F5", "%F6", "%F7", - "%F8", "%F9", "%FA", "%FB", 
"%FC", "%FD", "%FE", "%FF", -]; - - diff --git a/src/encoding.rs b/src/encoding.rs index 5cdd71d3..0703c788 100644 --- a/src/encoding.rs +++ b/src/encoding.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2014 Simon Sapin. +// Copyright 2013-2014 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license @@ -27,43 +27,64 @@ pub struct EncodingOverride { #[cfg(feature = "query_encoding")] impl EncodingOverride { - pub fn from_opt_encoding(encoding: Option) -> EncodingOverride { - encoding.map(EncodingOverride::from_encoding).unwrap_or_else(EncodingOverride::utf8) + pub fn from_opt_encoding(encoding: Option) -> Self { + encoding.map(Self::from_encoding).unwrap_or_else(Self::utf8) } - pub fn from_encoding(encoding: EncodingRef) -> EncodingOverride { + pub fn from_encoding(encoding: EncodingRef) -> Self { EncodingOverride { encoding: if encoding.name() == "utf-8" { None } else { Some(encoding) } } } - pub fn utf8() -> EncodingOverride { + #[inline] + pub fn utf8() -> Self { EncodingOverride { encoding: None } } - pub fn lookup(label: &[u8]) -> Option { + pub fn lookup(label: &[u8]) -> Option { + // Don't use String::from_utf8_lossy since no encoding label contains U+FFFD + // https://encoding.spec.whatwg.org/#names-and-labels ::std::str::from_utf8(label) .ok() .and_then(encoding_from_whatwg_label) - .map(EncodingOverride::from_encoding) + .map(Self::from_encoding) + } + + /// https://encoding.spec.whatwg.org/#get-an-output-encoding + pub fn to_output_encoding(self) -> Self { + if let Some(encoding) = self.encoding { + if matches!(encoding.name(), "utf-16le" | "utf-16be") { + return Self::utf8() + } + } + self } pub fn is_utf8(&self) -> bool { self.encoding.is_none() } - pub fn decode(&self, input: &[u8]) -> String { + pub fn name(&self) -> &'static str { match self.encoding { - Some(encoding) => encoding.decode(input, DecoderTrap::Replace).unwrap(), - None => String::from_utf8_lossy(input).to_string(), + Some(encoding) => encoding.name(), + 
None => "utf-8", } } - pub fn encode<'a>(&self, input: &'a str) -> Cow<'a, [u8]> { + pub fn decode<'a>(&self, input: Cow<'a, [u8]>) -> Cow<'a, str> { match self.encoding { - Some(encoding) => Cow::Owned( - encoding.encode(input, EncoderTrap::NcrEscape).unwrap()), - None => Cow::Borrowed(input.as_bytes()), // UTF-8 + // `encoding.decode` never returns `Err` when called with `DecoderTrap::Replace` + Some(encoding) => encoding.decode(&input, DecoderTrap::Replace).unwrap().into(), + None => decode_utf8_lossy(input), + } + } + + pub fn encode<'a>(&self, input: Cow<'a, str>) -> Cow<'a, [u8]> { + match self.encoding { + // `encoding.encode` never returns `Err` when called with `EncoderTrap::NcrEscape` + Some(encoding) => Cow::Owned(encoding.encode(&input, EncoderTrap::NcrEscape).unwrap()), + None => encode_utf8(input) } } } @@ -75,23 +96,40 @@ pub struct EncodingOverride; #[cfg(not(feature = "query_encoding"))] impl EncodingOverride { - pub fn utf8() -> EncodingOverride { + #[inline] + pub fn utf8() -> Self { EncodingOverride } - pub fn lookup(_label: &[u8]) -> Option { - None + pub fn decode<'a>(&self, input: Cow<'a, [u8]>) -> Cow<'a, str> { + decode_utf8_lossy(input) } - pub fn is_utf8(&self) -> bool { - true + pub fn encode<'a>(&self, input: Cow<'a, str>) -> Cow<'a, [u8]> { + encode_utf8(input) } +} - pub fn decode(&self, input: &[u8]) -> String { - String::from_utf8_lossy(input).into_owned() +pub fn decode_utf8_lossy(input: Cow<[u8]>) -> Cow { + match input { + Cow::Borrowed(bytes) => String::from_utf8_lossy(bytes), + Cow::Owned(bytes) => { + let raw_utf8: *const [u8]; + match String::from_utf8_lossy(&bytes) { + Cow::Borrowed(utf8) => raw_utf8 = utf8.as_bytes(), + Cow::Owned(s) => return s.into(), + } + // from_utf8_lossy returned a borrow of `bytes` unchanged. + debug_assert!(raw_utf8 == &*bytes as *const [u8]); + // Reuse the existing `Vec` allocation. 
+ unsafe { String::from_utf8_unchecked(bytes) }.into() + } } +} - pub fn encode<'a>(&self, input: &'a str) -> Cow<'a, [u8]> { - Cow::Borrowed(input.as_bytes()) +pub fn encode_utf8(input: Cow) -> Cow<[u8]> { + match input { + Cow::Borrowed(s) => Cow::Borrowed(s.as_bytes()), + Cow::Owned(s) => Cow::Owned(s.into_bytes()) } } diff --git a/src/form_urlencoded.rs b/src/form_urlencoded.rs index 9af1cc34..16fbeb51 100644 --- a/src/form_urlencoded.rs +++ b/src/form_urlencoded.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2015 Simon Sapin. +// Copyright 2013-2016 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license @@ -6,34 +6,37 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -//! Parser and serializer for the [`application/x-www-form-urlencoded` format]( +//! Parser and serializer for the [`application/x-www-form-urlencoded` syntax]( //! http://url.spec.whatwg.org/#application/x-www-form-urlencoded), //! as used by HTML forms. //! //! Converts between a string (such as an URL’s query string) //! and a sequence of (name, value) pairs. -use std::borrow::Borrow; -use std::ascii::AsciiExt; use encoding::EncodingOverride; -use percent_encoding::{percent_encode_to, percent_decode, FORM_URLENCODED_ENCODE_SET}; +use percent_encoding::{percent_encode_byte, percent_decode}; +use std::borrow::{Borrow, Cow}; +use std::str; -/// Convert a byte string in the `application/x-www-form-urlencoded` format -/// into a vector of (name, value) pairs. +/// Convert a byte string in the `application/x-www-form-urlencoded` syntax +/// into a iterator of (name, value) pairs. /// /// Use `parse(input.as_bytes())` to parse a `&str` string. /// -/// The names and values are URL-decoded. For instance, `%23first=%25try%25` will be +/// The names and values are percent-decoded. For instance, `%23first=%25try%25` will be /// converted to `[("#first", "%try%")]`. 
#[inline] -pub fn parse(input: &[u8]) -> Vec<(String, String)> { - parse_internal(input, EncodingOverride::utf8(), false).unwrap() +pub fn parse(input: &[u8]) -> Parse { + Parse { + input: input, + encoding: EncodingOverride::utf8(), + } } -/// Convert a byte string in the `application/x-www-form-urlencoded` format -/// into a vector of (name, value) pairs. +/// Convert a byte string in the `application/x-www-form-urlencoded` syntax +/// into a iterator of (name, value) pairs. /// /// Use `parse(input.as_bytes())` to parse a `&str` string. /// @@ -45,100 +48,317 @@ pub fn parse(input: &[u8]) -> Vec<(String, String)> { /// after percent-decoding. Defaults to UTF-8. /// * `use_charset`: The *use _charset_ flag*. If in doubt, set to `false`. #[cfg(feature = "query_encoding")] -#[inline] -pub fn parse_with_encoding(input: &[u8], encoding_override: Option<::encoding::EncodingRef>, - use_charset: bool) - -> Option> { - parse_internal(input, EncodingOverride::from_opt_encoding(encoding_override), use_charset) +pub fn parse_with_encoding<'a>(input: &'a [u8], + encoding_override: Option<::encoding::EncodingRef>, + use_charset: bool) + -> Result, ()> { + use std::ascii::AsciiExt; + + let mut encoding = EncodingOverride::from_opt_encoding(encoding_override); + if !(encoding.is_utf8() || input.is_ascii()) { + return Err(()) + } + if use_charset { + for sequence in input.split(|&b| b == b'&') { + // No '+' in "_charset_" to replace with ' '. + if sequence.starts_with(b"_charset_=") { + let value = &sequence[b"_charset_=".len()..]; + // Skip replacing '+' with ' ' in value since no encoding label contains either: + // https://encoding.spec.whatwg.org/#names-and-labels + if let Some(e) = EncodingOverride::lookup(value) { + encoding = e; + break + } + } + } + } + Ok(Parse { + input: input, + encoding: encoding, + }) } +/// The return type of `parse()`. 
+#[derive(Copy, Clone)] +pub struct Parse<'a> { + input: &'a [u8], + encoding: EncodingOverride, +} -fn parse_internal(input: &[u8], mut encoding_override: EncodingOverride, mut use_charset: bool) - -> Option> { - let mut pairs = Vec::new(); - for piece in input.split(|&b| b == b'&') { - if !piece.is_empty() { - let (name, value) = match piece.iter().position(|b| *b == b'=') { - Some(position) => (&piece[..position], &piece[position + 1..]), - None => (piece, &[][..]) - }; +impl<'a> Iterator for Parse<'a> { + type Item = (Cow<'a, str>, Cow<'a, str>); - #[inline] - fn replace_plus(input: &[u8]) -> Vec { - input.iter().map(|&b| if b == b'+' { b' ' } else { b }).collect() + fn next(&mut self) -> Option { + loop { + if self.input.is_empty() { + return None } + let mut split2 = self.input.splitn(2, |&b| b == b'&'); + let sequence = split2.next().unwrap(); + self.input = split2.next().unwrap_or(&[][..]); + if sequence.is_empty() { + continue + } + let mut split2 = sequence.splitn(2, |&b| b == b'='); + let name = split2.next().unwrap(); + let value = split2.next().unwrap_or(&[][..]); + return Some(( + decode(name, self.encoding), + decode(value, self.encoding), + )) + } + } +} - let name = replace_plus(name); - let value = replace_plus(value); - if use_charset && name == b"_charset_" { - if let Some(encoding) = EncodingOverride::lookup(&value) { - encoding_override = encoding; +fn decode(input: &[u8], encoding: EncodingOverride) -> Cow { + let replaced = replace_plus(input); + encoding.decode(match percent_decode(&replaced).if_any() { + Some(vec) => Cow::Owned(vec), + None => replaced, + }) +} + +/// Replace b'+' with b' ' +fn replace_plus<'a>(input: &'a [u8]) -> Cow<'a, [u8]> { + match input.iter().position(|&b| b == b'+') { + None => Cow::Borrowed(input), + Some(first_position) => { + let mut replaced = input.to_owned(); + replaced[first_position] = b' '; + for byte in &mut replaced[first_position + 1..] 
{ + if *byte == b'+' { + *byte = b' '; } - use_charset = false; } - pairs.push((name, value)); + Cow::Owned(replaced) } } - if !(encoding_override.is_utf8() || input.is_ascii()) { - return None +} + +impl<'a> Parse<'a> { + /// Return a new iterator that yields pairs of `String` instead of pairs of `Cow`. + pub fn into_owned(self) -> ParseIntoOwned<'a> { + ParseIntoOwned { inner: self } } +} - Some(pairs.into_iter().map(|(name, value)| ( - encoding_override.decode(&percent_decode(&name)), - encoding_override.decode(&percent_decode(&value)) - )).collect()) +/// Like `Parse`, but yields pairs of `String` instead of pairs of `Cow`. +pub struct ParseIntoOwned<'a> { + inner: Parse<'a> } +impl<'a> Iterator for ParseIntoOwned<'a> { + type Item = (String, String); -/// Convert an iterator of (name, value) pairs -/// into a string in the `application/x-www-form-urlencoded` format. -#[inline] -pub fn serialize(pairs: I) -> String -where I: IntoIterator, I::Item: Borrow<(K, V)>, K: AsRef, V: AsRef { - serialize_internal(pairs, EncodingOverride::utf8()) + fn next(&mut self) -> Option { + self.inner.next().map(|(k, v)| (k.into_owned(), v.into_owned())) + } } -/// Convert an iterator of (name, value) pairs -/// into a string in the `application/x-www-form-urlencoded` format. +/// The [`application/x-www-form-urlencoded` byte serializer]( +/// https://url.spec.whatwg.org/#concept-urlencoded-byte-serializer). /// -/// This function is only available if the `query_encoding` Cargo feature is enabled. -/// -/// Arguments: -/// -/// * `encoding_override`: The character encoding each name and values is encoded as -/// before percent-encoding. Defaults to UTF-8. 
-#[cfg(feature = "query_encoding")] -#[inline] -pub fn serialize_with_encoding(pairs: I, - encoding_override: Option<::encoding::EncodingRef>) - -> String -where I: IntoIterator, I::Item: Borrow<(K, V)>, K: AsRef, V: AsRef { - serialize_internal(pairs, EncodingOverride::from_opt_encoding(encoding_override)) -} - -fn serialize_internal(pairs: I, encoding_override: EncodingOverride) -> String -where I: IntoIterator, I::Item: Borrow<(K, V)>, K: AsRef, V: AsRef { - #[inline] - fn byte_serialize(input: &str, output: &mut String, - encoding_override: EncodingOverride) { - for &byte in encoding_override.encode(input).iter() { - if byte == b' ' { - output.push_str("+") - } else { - percent_encode_to(&[byte], FORM_URLENCODED_ENCODE_SET, output) +/// Return an iterator of `&str` slices. +pub fn byte_serialize(input: &[u8]) -> ByteSerialize { + ByteSerialize { + bytes: input, + } +} + +/// Return value of `byte_serialize()`. +pub struct ByteSerialize<'a> { + bytes: &'a [u8], +} + +fn byte_serialized_unchanged(byte: u8) -> bool { + matches!(byte, b'*' | b'-' | b'.' | b'0' ... b'9' | b'A' ... b'Z' | b'_' | b'a' ... 
b'z') +} + +impl<'a> Iterator for ByteSerialize<'a> { + type Item = &'a str; + + fn next(&mut self) -> Option<&'a str> { + if let Some((&first, tail)) = self.bytes.split_first() { + if !byte_serialized_unchanged(first) { + self.bytes = tail; + return Some(if first == b' ' { "+" } else { percent_encode_byte(first) }) + } + let position = tail.iter().position(|&b| !byte_serialized_unchanged(b)); + let (unchanged_slice, remaining) = match position { + // 1 for first_byte + i unchanged in tail + Some(i) => self.bytes.split_at(1 + i), + None => (self.bytes, &[][..]), + }; + self.bytes = remaining; + Some(unsafe { str::from_utf8_unchecked(unchanged_slice) }) + } else { + None + } + } + + fn size_hint(&self) -> (usize, Option) { + if self.bytes.is_empty() { + (0, Some(0)) + } else { + (1, Some(self.bytes.len())) + } + } +} + +/// The [`application/x-www-form-urlencoded` serializer]( +/// https://url.spec.whatwg.org/#concept-urlencoded-serializer). +pub struct Serializer { + target: Option, + start_position: usize, + encoding: EncodingOverride, +} + +pub trait Target { + fn as_mut_string(&mut self) -> &mut String; + fn finish(self) -> Self::Finished; + type Finished; +} + +impl Target for String { + fn as_mut_string(&mut self) -> &mut String { self } + fn finish(self) -> Self { self } + type Finished = Self; +} + +impl<'a> Target for &'a mut String { + fn as_mut_string(&mut self) -> &mut String { &mut **self } + fn finish(self) -> Self { self } + type Finished = Self; +} + +// `as_mut_string` string here exposes the internal serialization of an `Url`, +// which should not be exposed to users. 
+// We achieve that by not giving users direct access to `UrlQuery`: +// * Its fields are private +// (and so can not be constructed with struct literal syntax outside of this crate), +// * It has no constructor +// * It is only visible (on the type level) to users in the return type of +// `Url::mutate_query_pairs` which is `Serializer` +// * `Serializer` keeps its target in a private field +// * Unlike in other `Target` impls, `UrlQuery::finished` does not return `Self`. +impl<'a> Target for ::UrlQuery<'a> { + fn as_mut_string(&mut self) -> &mut String { &mut self.url.serialization } + fn finish(self) -> &'a mut ::Url { self.url } + type Finished = &'a mut ::Url; +} + +impl Serializer { + /// Create a new `application/x-www-form-urlencoded` serializer for the given target. + /// + /// If the target is non-empty, + /// its content is assumed to already be in `application/x-www-form-urlencoded` syntax. + pub fn new(target: T) -> Self { + Self::for_suffix(target, 0) + } + + /// Create a new `application/x-www-form-urlencoded` serializer + /// for a suffix of the given target. + /// + /// If that suffix is non-empty, + /// its content is assumed to already be in `application/x-www-form-urlencoded` syntax. + pub fn for_suffix(mut target: T, start_position: usize) -> Self { + &target.as_mut_string()[start_position..]; // Panic if out of bounds + Serializer { + target: Some(target), + start_position: start_position, + encoding: EncodingOverride::utf8(), + } + } + + /// Remove any existing name/value pair. + /// + /// Panics if called after `.finish()`. + pub fn clear(&mut self) -> &mut Self { + string(&mut self.target).truncate(self.start_position); + self + } + + /// Set the character encoding to be used for names and values before percent-encoding. 
+ #[cfg(feature = "query_encoding")] + pub fn encoding_override(&mut self, new: Option<::encoding::EncodingRef>) -> &mut Self { + self.encoding = EncodingOverride::from_opt_encoding(new).to_output_encoding(); + self + } + + /// Serialize and append a name/value pair. + /// + /// Panics if called after `.finish()`. + pub fn append_pair(&mut self, name: &str, value: &str) -> &mut Self { + append_pair(string(&mut self.target), self.start_position, self.encoding, name, value); + self + } + + /// Serialize and append a number of name/value pairs. + /// + /// This simply calls `append_pair` repeatedly. + /// This can be more convenient, so the user doesn’t need to introduce a block + /// to limit the scope of `Serializer`’s borrow of its string. + /// + /// Panics if called after `.finish()`. + pub fn extend_pairs(&mut self, iter: I) -> &mut Self + where I: IntoIterator, I::Item: Borrow<(K, V)>, K: AsRef, V: AsRef { + { + let string = string(&mut self.target); + for pair in iter { + let &(ref k, ref v) = pair.borrow(); + append_pair(string, self.start_position, self.encoding, k.as_ref(), v.as_ref()); } } + self } - let mut output = String::new(); - for pair in pairs { - let &(ref name, ref value) = pair.borrow(); - if !output.is_empty() { - output.push_str("&"); + /// Add a name/value pair whose name is `_charset_` + /// and whose value is the character encoding’s name. + /// (See the `encoding_override()` method.) + /// + /// Panics if called after `.finish()`. 
+ #[cfg(feature = "query_encoding")] + pub fn append_charset(&mut self) -> &mut Self { + { + let string = string(&mut self.target); + append_separator_if_needed(string, self.start_position); + string.push_str("_charset_="); + string.push_str(self.encoding.name()); } - byte_serialize(name.as_ref(), &mut output, encoding_override); - output.push_str("="); - byte_serialize(value.as_ref(), &mut output, encoding_override); + self } - output + + /// If this serializer was constructed with a string, take and return that string. + /// + /// ```rust + /// use url::form_urlencoded; + /// let encoded: String = form_urlencoded::Serializer::new(String::new()) + /// .append_pair("foo", "bar & baz") + /// .append_pair("saison", "Été+hiver") + /// .finish(); + /// assert_eq!(encoded, "foo=bar+%26+baz&saison=%C3%89t%C3%A9%2Bhiver"); + /// ``` + /// + /// Panics if called more than once. + pub fn finish(&mut self) -> T::Finished { + self.target.take().expect("url::form_urlencoded::Serializer double finish").finish() + } +} + +fn append_separator_if_needed(string: &mut String, start_position: usize) { + if string.len() > start_position { + string.push('&') + } +} + +fn string(target: &mut Option) -> &mut String { + target.as_mut().expect("url::form_urlencoded::Serializer finished").as_mut_string() +} + +fn append_pair(string: &mut String, start_position: usize, encoding: EncodingOverride, + name: &str, value: &str) { + append_separator_if_needed(string, start_position); + string.extend(byte_serialize(&encoding.encode(name.into()))); + string.push('='); + string.extend(byte_serialize(&encoding.encode(value.into()))); } diff --git a/src/format.rs b/src/format.rs deleted file mode 100644 index ad656056..00000000 --- a/src/format.rs +++ /dev/null @@ -1,81 +0,0 @@ -// Copyright 2013-2015 Simon Sapin. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. 
- -//! Formatting utilities for URLs. -//! -//! These formatters can be used to coerce various URL parts into strings. -//! -//! You can use `.to_string()`, as the formatters implement `fmt::Display`. - -use std::fmt::{self, Formatter}; -use super::Url; - -/// Formatter and serializer for URL path data. -pub struct PathFormatter<'a, T:'a> { - /// The path as a slice of string-like objects (String or &str). - pub path: &'a [T] -} - -impl<'a, T: fmt::Display> fmt::Display for PathFormatter<'a, T> { - fn fmt(&self, formatter: &mut Formatter) -> fmt::Result { - if self.path.is_empty() { - formatter.write_str("/") - } else { - for path_part in self.path { - try!("/".fmt(formatter)); - try!(path_part.fmt(formatter)); - } - Ok(()) - } - } -} - - -/// Formatter and serializer for URL username and password data. -pub struct UserInfoFormatter<'a> { - /// URL username as a string slice. - pub username: &'a str, - - /// URL password as an optional string slice. - /// - /// You can convert an `Option` with `.as_ref().map(|s| s)`. - pub password: Option<&'a str> -} - -impl<'a> fmt::Display for UserInfoFormatter<'a> { - fn fmt(&self, formatter: &mut Formatter) -> fmt::Result { - if !self.username.is_empty() || self.password.is_some() { - try!(formatter.write_str(self.username)); - if let Some(password) = self.password { - try!(formatter.write_str(":")); - try!(formatter.write_str(password)); - } - try!(formatter.write_str("@")); - } - Ok(()) - } -} - - -/// Formatter for URLs which ignores the fragment field. 
-pub struct UrlNoFragmentFormatter<'a> { - pub url: &'a Url -} - -impl<'a> fmt::Display for UrlNoFragmentFormatter<'a> { - fn fmt(&self, formatter: &mut Formatter) -> fmt::Result { - try!(formatter.write_str(&self.url.scheme)); - try!(formatter.write_str(":")); - try!(self.url.scheme_data.fmt(formatter)); - if let Some(ref query) = self.url.query { - try!(formatter.write_str("?")); - try!(formatter.write_str(query)); - } - Ok(()) - } -} diff --git a/src/host.rs b/src/host.rs index 06ac7818..47ebbd79 100644 --- a/src/host.rs +++ b/src/host.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2014 Simon Sapin. +// Copyright 2013-2016 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license @@ -6,78 +6,96 @@ // option. This file may not be copied, modified, or distributed // except according to those terms. -use std::ascii::AsciiExt; use std::cmp; -use std::fmt::{self, Formatter}; -use std::net::{Ipv4Addr, Ipv6Addr}; +use std::fmt::{self, Formatter, Write}; +use std::io; +use std::net::{Ipv4Addr, Ipv6Addr, SocketAddr, SocketAddrV4, SocketAddrV6, ToSocketAddrs}; +use std::vec; use parser::{ParseResult, ParseError}; -use percent_encoding::{from_hex, percent_decode}; +use percent_encoding::percent_decode; use idna; +#[derive(Copy, Clone, Debug, Eq, PartialEq)] +#[cfg_attr(feature="heap_size", derive(HeapSizeOf))] +pub enum HostInternal { + None, + Domain, + Ipv4(Ipv4Addr), + Ipv6(Ipv6Addr), +} + +impl From> for HostInternal { + fn from(host: Host) -> HostInternal { + match host { + Host::Domain(_) => HostInternal::Domain, + Host::Ipv4(address) => HostInternal::Ipv4(address), + Host::Ipv6(address) => HostInternal::Ipv6(address), + } + } +} /// The host name of an URL. -#[derive(PartialEq, Eq, Clone, Debug, Hash, PartialOrd, Ord)] +#[derive(Clone, Debug, Eq, PartialEq, Ord, PartialOrd, Hash)] #[cfg_attr(feature="heap_size", derive(HeapSizeOf))] -pub enum Host { - /// A (DNS) domain name. 
- Domain(String), - /// A IPv4 address, represented by four sequences of up to three ASCII digits. +pub enum Host { + /// A DNS domain name, as '.' dot-separated labels. + /// Non-ASCII labels are encoded in punycode per IDNA. + Domain(S), + + /// An IPv4 address. + /// `Url::host_str` returns the serialization of this address, + /// as four decimal integers separated by `.` dots. Ipv4(Ipv4Addr), - /// An IPv6 address, represented inside `[...]` square brackets - /// so that `:` colon characters in the address are not ambiguous - /// with the port number delimiter. + + /// An IPv6 address. + /// `Url::host_str` returns the serialization of that address between `[` and `]` brackets, + /// in the format per [RFC 5952 *A Recommendation + /// for IPv6 Address Text Representation*](https://tools.ietf.org/html/rfc5952): + /// lowercase hexadecimal with maximal `::` compression. Ipv6(Ipv6Addr), } +impl<'a> Host<&'a str> { + /// Return a copy of `self` that owns an allocated `String` but does not borrow an `&Url`. + pub fn to_owned(&self) -> Host { + match *self { + Host::Domain(domain) => Host::Domain(domain.to_owned()), + Host::Ipv4(address) => Host::Ipv4(address), + Host::Ipv6(address) => Host::Ipv6(address), + } + } +} -impl Host { +impl Host { /// Parse a host: either an IPv6 address in [] square brackets, or a domain. /// - /// Returns `Err` for an empty host, an invalid IPv6 address, - /// or a or invalid non-ASCII domain. 
- pub fn parse(input: &str) -> ParseResult { - if input.len() == 0 { - return Err(ParseError::EmptyHost) - } + /// https://url.spec.whatwg.org/#host-parsing + pub fn parse(input: &str) -> Result { if input.starts_with("[") { if !input.ends_with("]") { return Err(ParseError::InvalidIpv6Address) } return parse_ipv6addr(&input[1..input.len() - 1]).map(Host::Ipv6) } - let decoded = percent_decode(input.as_bytes()); - let domain = String::from_utf8_lossy(&decoded); - - let domain = match idna::domain_to_ascii(&domain) { - Ok(s) => s, - Err(_) => return Err(ParseError::InvalidDomainCharacter) - }; - - if domain.find(&[ - '\0', '\t', '\n', '\r', ' ', '#', '%', '/', ':', '?', '@', '[', '\\', ']' - ][..]).is_some() { + let domain = percent_decode(input.as_bytes()).decode_utf8_lossy(); + let domain = try!(idna::domain_to_ascii(&domain)); + if domain.find(|c| matches!(c, + '\0' | '\t' | '\n' | '\r' | ' ' | '#' | '%' | '/' | ':' | '?' | '@' | '[' | '\\' | ']' + )).is_some() { return Err(ParseError::InvalidDomainCharacter) } - match parse_ipv4addr(&domain[..]) { - Ok(Some(ipv4addr)) => Ok(Host::Ipv4(ipv4addr)), - Ok(None) => Ok(Host::Domain(domain.to_ascii_lowercase())), - Err(e) => Err(e), + if let Some(address) = try!(parse_ipv4addr(&domain)) { + Ok(Host::Ipv4(address)) + } else { + Ok(Host::Domain(domain.into())) } } - - /// Serialize the host as a string. - /// - /// A domain a returned as-is, an IPv6 address between [] square brackets. - pub fn serialize(&self) -> String { - self.to_string() - } } - -impl fmt::Display for Host { +impl> fmt::Display for Host { fn fmt(&self, f: &mut Formatter) -> fmt::Result { match *self { - Host::Domain(ref domain) => domain.fmt(f), + Host::Domain(ref domain) => domain.as_ref().fmt(f), Host::Ipv4(ref addr) => addr.fmt(f), Host::Ipv6(ref addr) => { try!(f.write_str("[")); @@ -88,6 +106,68 @@ impl fmt::Display for Host { } } +/// This mostly exists because coherence rules don’t allow us to implement +/// `ToSocketAddrs for (Host, u16)`. 
+pub struct HostAndPort { + pub host: Host, + pub port: u16, +} + +impl<'a> HostAndPort<&'a str> { + /// Return a copy of `self` that owns an allocated `String` but does not borrow an `&Url`. + pub fn to_owned(&self) -> HostAndPort { + HostAndPort { + host: self.host.to_owned(), + port: self.port + } + } +} + +impl> ToSocketAddrs for HostAndPort { + type Iter = SocketAddrs; + + fn to_socket_addrs(&self) -> io::Result { + let port = self.port; + match self.host { + Host::Domain(ref domain) => Ok(SocketAddrs { + // FIXME: use std::net::lookup_host when it’s stable. + state: SocketAddrsState::Domain(try!((domain.as_ref(), port).to_socket_addrs())) + }), + Host::Ipv4(address) => Ok(SocketAddrs { + state: SocketAddrsState::One(SocketAddr::V4(SocketAddrV4::new(address, port))) + }), + Host::Ipv6(address) => Ok(SocketAddrs { + state: SocketAddrsState::One(SocketAddr::V6(SocketAddrV6::new(address, port, 0, 0))) + }), + } + } +} + +/// Socket addresses for an URL. +pub struct SocketAddrs { + state: SocketAddrsState +} + +enum SocketAddrsState { + Domain(vec::IntoIter), + One(SocketAddr), + Done, +} + +impl Iterator for SocketAddrs { + type Item = SocketAddr; + fn next(&mut self) -> Option { + match self.state { + SocketAddrsState::Domain(ref mut iter) => iter.next(), + SocketAddrsState::One(s) => { + self.state = SocketAddrsState::Done; + Some(s) + } + SocketAddrsState::Done => None + } + } +} + fn write_ipv6(addr: &Ipv6Addr, f: &mut Formatter) -> fmt::Result { let segments = addr.segments(); let (compress_start, compress_end) = longest_zero_sequence(&segments); @@ -143,7 +223,7 @@ fn longest_zero_sequence(pieces: &[u16; 8]) -> (isize, isize) { } -fn parse_ipv4number(mut input: &str) -> ParseResult { +fn parse_ipv4number(mut input: &str) -> Result { let mut r = 10; if input.starts_with("0x") || input.starts_with("0X") { input = &input[2..]; @@ -156,15 +236,18 @@ fn parse_ipv4number(mut input: &str) -> ParseResult { return Ok(0); } if input.starts_with("+") { - return 
Err(ParseError::InvalidIpv4Address) + return Err(()) } match u32::from_str_radix(&input, r) { Ok(number) => Ok(number), - Err(_) => Err(ParseError::InvalidIpv4Address), + Err(_) => Err(()), } } fn parse_ipv4addr(input: &str) -> ParseResult> { + if input.is_empty() { + return Ok(None) + } let mut parts: Vec<&str> = input.split('.').collect(); if parts.last() == Some(&"") { parts.pop(); @@ -237,7 +320,7 @@ fn parse_ipv6addr(input: &str) -> ParseResult { let end = cmp::min(len, start + 4); let mut value = 0u16; while i < end { - match from_hex(input[i]) { + match (input[i] as char).to_digit(16) { Some(digit) => { value = value * 0x10 + digit as u16; i += 1; diff --git a/src/lib.rs b/src/lib.rs index 9caffad0..5feba5b0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2015 Simon Sapin. +// Copyright 2013-2015 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license @@ -50,42 +50,42 @@ assert!(Url::parse("http://[:::1]") == Err(ParseError::InvalidIpv6Address)) Let’s parse a valid URL and look at its components. ``` -use url::{Url, SchemeData}; +use url::{Url, Host}; let issue_list_url = Url::parse( "https://github.com/rust-lang/rust/issues?labels=E-easy&state=open" ).unwrap(); -assert!(issue_list_url.scheme == "https".to_string()); -assert!(issue_list_url.domain() == Some("github.com")); +assert!(issue_list_url.scheme() == "https"); +assert!(issue_list_url.username() == ""); +assert!(issue_list_url.password() == None); +assert!(issue_list_url.host_str() == Some("github.com")); +assert!(issue_list_url.host() == Some(Host::Domain("github.com"))); assert!(issue_list_url.port() == None); -assert!(issue_list_url.path() == Some(&["rust-lang".to_string(), - "rust".to_string(), - "issues".to_string()][..])); -assert!(issue_list_url.query == Some("labels=E-easy&state=open".to_string())); -assert!(issue_list_url.fragment == None); -match issue_list_url.scheme_data { - SchemeData::Relative(..) 
=> {}, // Expected - SchemeData::NonRelative(..) => panic!(), -} +assert!(issue_list_url.path() == "/rust-lang/rust/issues"); +assert!(issue_list_url.path_segments().map(|c| c.collect::>()) == + Some(vec!["rust-lang", "rust", "issues"])); +assert!(issue_list_url.query() == Some("labels=E-easy&state=open")); +assert!(issue_list_url.fragment() == None); +assert!(!issue_list_url.cannot_be_a_base()); ``` -The `scheme`, `query`, and `fragment` are directly fields of the `Url` struct: -they apply to all URLs. -Every other components has accessors because they only apply to URLs said to be -“in a relative scheme”. `https` is a relative scheme, but `data` is not: +Some URLs are said to be *cannot-be-a-base*: +they don’t have a username, password, host, or port, +and their "path" is an arbitrary string rather than slash-separated segments: ``` -use url::{Url, SchemeData}; +use url::Url; -let data_url = Url::parse("data:text/plain,Hello#").unwrap(); +let data_url = Url::parse("data:text/plain,Hello?World#").unwrap(); -assert!(data_url.scheme == "data".to_string()); -assert!(data_url.scheme_data == SchemeData::NonRelative("text/plain,Hello".to_string())); -assert!(data_url.non_relative_scheme_data() == Some("text/plain,Hello")); -assert!(data_url.query == None); -assert!(data_url.fragment == Some("".to_string())); +assert!(data_url.cannot_be_a_base()); +assert!(data_url.scheme() == "data"); +assert!(data_url.path() == "text/plain,Hello"); +assert!(data_url.path_segments().is_none()); +assert!(data_url.query() == Some("World")); +assert!(data_url.fragment() == Some("")); ``` @@ -97,7 +97,7 @@ Many contexts allow URL *references* that can be relative to a *base URL*: ``` -Since parsed URL are absolute, giving a base is required: +Since parsed URL are absolute, giving a base is required for parsing relative URLs: ``` use url::{Url, ParseError}; @@ -105,514 +105,972 @@ use url::{Url, ParseError}; assert!(Url::parse("../main.css") == Err(ParseError::RelativeUrlWithoutBase)) ``` 
-`UrlParser` is a method-chaining API to provide various optional parameters -to URL parsing, including a base URL. - -``` -use url::{Url, UrlParser}; - -let this_document = Url::parse("http://servo.github.io/rust-url/url/index.html").unwrap(); -let css_url = UrlParser::new().base_url(&this_document).parse("../main.css").unwrap(); -assert!(css_url.serialize() == "http://servo.github.io/rust-url/main.css".to_string()); -``` - -For convenience, the `join` method on `Url` is also provided to achieve the same result: +Use the `join` method on an `Url` to use it as a base URL: ``` use url::Url; let this_document = Url::parse("http://servo.github.io/rust-url/url/index.html").unwrap(); let css_url = this_document.join("../main.css").unwrap(); -assert!(&*css_url.serialize() == "http://servo.github.io/rust-url/main.css") +assert_eq!(css_url.as_str(), "http://servo.github.io/rust-url/main.css") */ #![cfg_attr(feature="heap_size", feature(plugin, custom_derive))] #![cfg_attr(feature="heap_size", plugin(heapsize_plugin))] -extern crate rustc_serialize; -extern crate uuid; - -#[macro_use] -extern crate matches; - -#[cfg(feature="serde_serialization")] -extern crate serde; - -#[cfg(feature="heap_size")] -#[macro_use] extern crate heapsize; +#[cfg(feature="rustc-serialize")] extern crate rustc_serialize; +#[macro_use] extern crate matches; +#[cfg(feature="serde")] extern crate serde; +#[cfg(feature="heap_size")] #[macro_use] extern crate heapsize; -extern crate unicode_normalization; -extern crate unicode_bidi; - -use std::fmt::{self, Formatter}; -use std::str; -use std::path::{Path, PathBuf}; -use std::borrow::Borrow; -use std::hash::{Hash, Hasher}; -use std::cmp::Ordering; +pub extern crate idna; -#[cfg(feature="serde_serialization")] -use std::str::FromStr; - -pub use host::Host; -pub use parser::{ErrorHandler, ParseResult, ParseError}; - -use percent_encoding::{percent_encode, lossy_utf8_percent_decode, DEFAULT_ENCODE_SET}; - -use format::{PathFormatter, UserInfoFormatter, 
UrlNoFragmentFormatter}; use encoding::EncodingOverride; +use host::HostInternal; +use parser::{Parser, Context, SchemeType, to_u32}; +use percent_encoding::{PATH_SEGMENT_ENCODE_SET, USERINFO_ENCODE_SET, + percent_encode, percent_decode, utf8_percent_encode}; +use std::cmp; +use std::fmt::{self, Write}; +use std::hash; +use std::io; +use std::mem; +use std::net::{ToSocketAddrs, IpAddr}; +use std::ops::{Range, RangeFrom, RangeTo}; +use std::path::{Path, PathBuf}; +use std::str; -use uuid::Uuid; +pub use origin::{Origin, OpaqueOrigin}; +pub use host::{Host, HostAndPort, SocketAddrs}; +pub use parser::ParseError; +pub use slicing::Position; mod encoding; mod host; +mod origin; mod parser; -pub mod urlutils; -pub mod percent_encoding; +mod slicing; + pub mod form_urlencoded; -pub mod punycode; -pub mod format; -pub mod idna; +pub mod percent_encoding; +pub mod quirks; -/// The parsed representation of an absolute URL. -#[derive(PartialEq, Eq, Clone, Debug, Hash, PartialOrd, Ord)] +/// A parsed URL record. +#[derive(Clone)] #[cfg_attr(feature="heap_size", derive(HeapSizeOf))] pub struct Url { - /// The scheme (a.k.a. protocol) of the URL, in ASCII lower case. - pub scheme: String, - - /// The components of the URL whose representation depends on where the scheme is *relative*. - pub scheme_data: SchemeData, - - /// The query string of the URL. + /// Syntax in pseudo-BNF: /// - /// `None` if the `?` delimiter character was not part of the parsed input, - /// otherwise a possibly empty, percent-encoded string. - /// - /// Percent encoded strings are within the ASCII range. - /// - /// See also the `query_pairs`, `set_query_from_pairs`, - /// and `lossy_percent_decode_query` methods. - pub query: Option, + /// url = scheme ":" [ hierarchical | non-hierarchical ] [ "?" query ]? [ "#" fragment ]? + /// non-hierarchical = non-hierarchical-path + /// non-hierarchical-path = /* Does not start with "/" */ + /// hierarchical = authority? 
hierarchical-path + /// authority = "//" userinfo? host [ ":" port ]? + /// userinfo = username [ ":" password ]? "@" + /// hierarchical-path = [ "/" path-segment ]+ + serialization: String, + + // Components + scheme_end: u32, // Before ':' + username_end: u32, // Before ':' (if a password is given) or '@' (if not) + host_start: u32, + host_end: u32, + host: HostInternal, + port: Option, + path_start: u32, // Before initial '/', if any + query_start: Option, // Before '?', unlike Position::QueryStart + fragment_start: Option, // Before '#', unlike Position::FragmentStart +} - /// The fragment identifier of the URL. - /// - /// `None` if the `#` delimiter character was not part of the parsed input, - /// otherwise a possibly empty, percent-encoded string. - /// - /// Percent encoded strings are within the ASCII range. - /// - /// See also the `lossy_percent_decode_fragment` method. - pub fragment: Option, +/// Full configuration for the URL parser. +#[derive(Copy, Clone)] +pub struct ParseOptions<'a> { + base_url: Option<&'a Url>, + encoding_override: encoding::EncodingOverride, + log_syntax_violation: Option<&'a Fn(&'static str)>, } -/// Opaque identifier for URLs that have file or other schemes -#[derive(PartialEq, Eq, Clone, Debug)] -pub struct OpaqueOrigin(Uuid); +impl<'a> ParseOptions<'a> { + /// Change the base URL + pub fn base_url(mut self, new: Option<&'a Url>) -> Self { + self.base_url = new; + self + } -#[cfg(feature="heap_size")] -known_heap_size!(0, OpaqueOrigin); + /// Override the character encoding of query strings. + /// This is a legacy concept only relevant for HTML. + #[cfg(feature = "query_encoding")] + pub fn encoding_override(mut self, new: Option) -> Self { + self.encoding_override = EncodingOverride::from_opt_encoding(new).to_output_encoding(); + self + } + + /// Call the provided function or closure on non-fatal parse errors. 
+ pub fn log_syntax_violation(mut self, new: Option<&'a Fn(&'static str)>) -> Self { + self.log_syntax_violation = new; + self + } -impl OpaqueOrigin { - /// Creates a new opaque origin with a random UUID. - pub fn new() -> OpaqueOrigin { - OpaqueOrigin(Uuid::new_v4()) + /// Parse an URL string with the configuration so far. + pub fn parse(self, input: &str) -> Result { + Parser { + serialization: String::with_capacity(input.len()), + base_url: self.base_url, + query_encoding_override: self.encoding_override, + log_syntax_violation: self.log_syntax_violation, + context: Context::UrlParser, + }.parse_url(input) } } -/// The origin of the URL -#[derive(PartialEq, Eq, Clone, Debug)] -#[cfg_attr(feature="heap_size", derive(HeapSizeOf))] -pub enum Origin { - /// A globally unique identifier - UID(OpaqueOrigin), +impl Url { + /// Parse an absolute URL from a string. + #[inline] + pub fn parse(input: &str) -> Result { + Url::options().parse(input) + } - /// Consists of the URL's scheme, host and port - Tuple(String, Host, u16) -} + /// Parse a string as an URL, with this URL as the base URL. + #[inline] + pub fn join(&self, input: &str) -> Result { + Url::options().base_url(Some(self)).parse(input) + } -/// The components of the URL whose representation depends on where the scheme is *relative*. -#[derive(PartialEq, Eq, Clone, Debug, Hash, PartialOrd, Ord)] -#[cfg_attr(feature="heap_size", derive(HeapSizeOf))] -pub enum SchemeData { - /// Components for URLs in a *relative* scheme such as HTTP. - Relative(RelativeSchemeData), + /// Return a default `ParseOptions` that can fully configure the URL parser. + pub fn options<'a>() -> ParseOptions<'a> { + ParseOptions { + base_url: None, + encoding_override: EncodingOverride::utf8(), + log_syntax_violation: None, + } + } - /// No further structure is assumed for *non-relative* schemes such as `data` and `mailto`. + /// Return the serialization of this URL. 
/// - /// This is a single percent-encoded string, whose interpretation depends on the scheme. - /// - /// Percent encoded strings are within the ASCII range. - NonRelative(String), -} + /// This is fast since that serialization is already stored in the `Url` struct. + #[inline] + pub fn as_str(&self) -> &str { + &self.serialization + } -/// Components for URLs in a *relative* scheme such as HTTP. -#[derive(Clone, Debug)] -#[cfg_attr(feature="heap_size", derive(HeapSizeOf))] -pub struct RelativeSchemeData { - /// The username of the URL, as a possibly empty, percent-encoded string. + /// Return the serialization of this URL. /// - /// Percent encoded strings are within the ASCII range. - /// - /// See also the `lossy_percent_decode_username` method. - pub username: String, + /// This consumes the `Url` and takes ownership of the `String` stored in it. + #[inline] + pub fn into_string(self) -> String { + self.serialization + } - /// The password of the URL. - /// - /// `None` if the `:` delimiter character was not part of the parsed input, - /// otherwise a possibly empty, percent-encoded string. - /// - /// Percent encoded strings are within the ASCII range. + /// For internal testing. /// - /// See also the `lossy_percent_decode_password` method. - pub password: Option, - - /// The host of the URL, either a domain name or an IPv4 address - pub host: Host, + /// Methods of the `Url` struct assume a number of invariants. + /// This checks each of these invariants and panic if one is not met. + /// This is for testing rust-url itself. + pub fn assert_invariants(&self) { + macro_rules! assert { + ($x: expr) => { + if !$x { + panic!("!( {} ) for URL {:?}", stringify!($x), self.serialization) + } + } + } - /// The port number of the URL. - /// `None` for file-like schemes, or to indicate the default port number. - pub port: Option, + macro_rules! 
assert_eq { + ($a: expr, $b: expr) => { + { + let a = $a; + let b = $b; + if a != b { + panic!("{:?} != {:?} ({} != {}) for URL {:?}", + a, b, stringify!($a), stringify!($b), self.serialization) + } + } + } + } - /// The default port number for the URL’s scheme. - /// `None` for file-like schemes. - pub default_port: Option, + assert!(self.scheme_end >= 1); + assert!(matches!(self.byte_at(0), b'a'...b'z' | b'A'...b'Z')); + assert!(self.slice(1..self.scheme_end).chars() + .all(|c| matches!(c, 'a'...'z' | 'A'...'Z' | '0'...'9' | '+' | '-' | '.'))); + assert_eq!(self.byte_at(self.scheme_end), b':'); + + if self.slice(self.scheme_end + 1 ..).starts_with("//") { + // URL with authority + match self.byte_at(self.username_end) { + b':' => { + assert!(self.host_start >= self.username_end + 2); + assert_eq!(self.byte_at(self.host_start - 1), b'@'); + } + b'@' => assert!(self.host_start == self.username_end + 1), + _ => assert_eq!(self.username_end, self.scheme_end + 3), + } + assert!(self.host_start >= self.username_end); + assert!(self.host_end >= self.host_start); + let host_str = self.slice(self.host_start..self.host_end); + match self.host { + HostInternal::None => assert_eq!(host_str, ""), + HostInternal::Ipv4(address) => assert_eq!(host_str, address.to_string()), + HostInternal::Ipv6(address) => assert_eq!(host_str, format!("[{}]", address)), + HostInternal::Domain => { + if SchemeType::from(self.scheme()).is_special() { + assert!(!host_str.is_empty()) + } + } + } + if self.path_start == self.host_end { + assert_eq!(self.port, None); + } else { + assert_eq!(self.byte_at(self.host_end), b':'); + let port_str = self.slice(self.host_end + 1..self.path_start); + assert_eq!(self.port, Some(port_str.parse::().unwrap())); + } + assert_eq!(self.byte_at(self.path_start), b'/'); + } else { + // Anarchist URL (no authority) + assert_eq!(self.username_end, self.scheme_end + 1); + assert_eq!(self.host_start, self.scheme_end + 1); + assert_eq!(self.host_end, self.scheme_end + 1); + 
assert_eq!(self.host, HostInternal::None); + assert_eq!(self.port, None); + assert_eq!(self.path_start, self.scheme_end + 1); + } + if let Some(start) = self.query_start { + assert!(start > self.path_start); + assert_eq!(self.byte_at(start), b'?'); + } + if let Some(start) = self.fragment_start { + assert!(start > self.path_start); + assert_eq!(self.byte_at(start), b'#'); + } + if let (Some(query_start), Some(fragment_start)) = (self.query_start, self.fragment_start) { + assert!(fragment_start > query_start); + } + } - /// The path of the URL, as vector of percent-encoded strings. - /// - /// Percent encoded strings are within the ASCII range. + /// Return the origin of this URL (https://url.spec.whatwg.org/#origin) /// - /// See also the `serialize_path` method and, - /// for URLs in the `file` scheme, the `to_file_path` method. - pub path: Vec, -} - -impl RelativeSchemeData { - fn get_identity_key(&self) -> (&String, &Option, &Host, Option, Option, &Vec) { - ( - &self.username, - &self.password, - &self.host, - self.port.or(self.default_port), - self.default_port, - &self.path - ) + /// Note: this return an opaque origin for `file:` URLs, which causes + /// `url.origin() != url.origin()`. + #[inline] + pub fn origin(&self) -> Origin { + origin::url_origin(self) } -} - -impl PartialEq for RelativeSchemeData { - fn eq(&self, other: &RelativeSchemeData) -> bool { - self.get_identity_key() == other.get_identity_key() + /// Return the scheme of this URL, lower-cased, as an ASCII string without the ':' delimiter. + #[inline] + pub fn scheme(&self) -> &str { + self.slice(..self.scheme_end) } -} - -impl Eq for RelativeSchemeData {} -impl Hash for RelativeSchemeData { - fn hash(&self, state: &mut H) { - self.get_identity_key().hash(state) + /// Return whether the URL has an 'authority', + /// which can contain a username, password, host, and port number. 
+ /// + /// URLs that do *not* are either path-only like `unix:/run/foo.socket` + /// or cannot-be-a-base like `data:text/plain,Stuff`. + #[inline] + pub fn has_authority(&self) -> bool { + debug_assert!(self.byte_at(self.scheme_end) == b':'); + self.slice(self.scheme_end..).starts_with("://") } -} -impl PartialOrd for RelativeSchemeData { - fn partial_cmp(&self, other: &RelativeSchemeData) -> Option { - self.get_identity_key().partial_cmp(&other.get_identity_key()) + /// Return whether this URL is a cannot-be-a-base URL, + /// meaning that parsing a relative URL string with this URL as the base will return an error. + /// + /// This is the case if the scheme and `:` delimiter are not followed by a `/` slash, + /// as is typically the case of `data:` and `mailto:` URLs. + #[inline] + pub fn cannot_be_a_base(&self) -> bool { + self.byte_at(self.path_start) != b'/' } -} -impl Ord for RelativeSchemeData { - fn cmp(&self, other: &Self) -> Ordering { - self.get_identity_key().cmp(&other.get_identity_key()) + /// Return the username for this URL (typically the empty string) + /// as a percent-encoded ASCII string. + pub fn username(&self) -> &str { + if self.has_authority() { + self.slice(self.scheme_end + ("://".len() as u32)..self.username_end) + } else { + "" + } } -} - -impl str::FromStr for Url { - type Err = ParseError; - fn from_str(url: &str) -> ParseResult { - Url::parse(url) + /// Return the password for this URL, if any, as a percent-encoded ASCII string. + pub fn password(&self) -> Option<&str> { + // This ':' is not the one marking a port number since a host can not be empty. + // (Except for file: URLs, which do not have port numbers.) + if self.has_authority() && self.byte_at(self.username_end) == b':' { + debug_assert!(self.byte_at(self.host_start - 1) == b'@'); + Some(self.slice(self.username_end + 1..self.host_start - 1)) + } else { + None + } } -} -/// A set of optional parameters for URL parsing. 
-pub struct UrlParser<'a> { - base_url: Option<&'a Url>, - query_encoding_override: EncodingOverride, - error_handler: ErrorHandler, - scheme_type_mapper: fn(scheme: &str) -> SchemeType, -} + /// Equivalent to `url.host().is_some()`. + pub fn has_host(&self) -> bool { + !matches!(self.host, HostInternal::None) + } + /// Return the string representation of the host (domain or IP address) for this URL, if any. + /// + /// Non-ASCII domains are punycode-encoded per IDNA. + /// IPv6 addresses are given between `[` and `]` brackets. + /// + /// Cannot-be-a-base URLs (typical of `data:` and `mailto:`) and some `file:` URLs + /// don’t have a host. + /// + /// See also the `host` method. + pub fn host_str(&self) -> Option<&str> { + if self.has_host() { + Some(self.slice(self.host_start..self.host_end)) + } else { + None + } + } -/// A method-chaining API to provide a set of optional parameters for URL parsing. -impl<'a> UrlParser<'a> { - /// Return a new UrlParser with default parameters. - #[inline] - pub fn new() -> UrlParser<'a> { - fn silent_handler(_reason: ParseError) -> ParseResult<()> { Ok(()) } - UrlParser { - base_url: None, - query_encoding_override: EncodingOverride::utf8(), - error_handler: silent_handler, - scheme_type_mapper: whatwg_scheme_type_mapper, + /// Return the parsed representation of the host for this URL. + /// Non-ASCII domain labels are punycode-encoded per IDNA. + /// + /// Cannot-be-a-base URLs (typical of `data:` and `mailto:`) and some `file:` URLs + /// don’t have a host. + /// + /// See also the `host_str` method. + pub fn host(&self) -> Option> { + match self.host { + HostInternal::None => None, + HostInternal::Domain => Some(Host::Domain(self.slice(self.host_start..self.host_end))), + HostInternal::Ipv4(address) => Some(Host::Ipv4(address)), + HostInternal::Ipv6(address) => Some(Host::Ipv6(address)), } } - /// Set the base URL used for resolving relative URL references, and return the `UrlParser`. 
- /// The default is no base URL, so that relative URLs references fail to parse. - #[inline] - pub fn base_url<'b>(&'b mut self, value: &'a Url) -> &'b mut UrlParser<'a> { - self.base_url = Some(value); - self + /// If this URL has a host and it is a domain name (not an IP address), return it. + pub fn domain(&self) -> Option<&str> { + match self.host { + HostInternal::Domain => Some(self.slice(self.host_start..self.host_end)), + _ => None, + } } - /// Set the character encoding the query string is encoded as before percent-encoding, - /// and return the `UrlParser`. - /// - /// This legacy quirk is only relevant to HTML. - /// - /// This method is only available if the `query_encoding` Cargo feature is enabled. - #[cfg(feature = "query_encoding")] + /// Return the port number for this URL, if any. #[inline] - pub fn query_encoding_override<'b>(&'b mut self, value: encoding::EncodingRef) - -> &'b mut UrlParser<'a> { - self.query_encoding_override = EncodingOverride::from_encoding(value); - self + pub fn port(&self) -> Option { + self.port } - /// Set an error handler for non-fatal parse errors, and return the `UrlParser`. + /// Return the port number for this URL, or the default port number if it is known. /// - /// Non-fatal parse errors are normally ignored by the parser, - /// but indicate violations of authoring requirements. - /// An error handler can be used, for example, to log these errors in the console - /// of a browser’s developer tools. + /// This method only knows the default port number + /// of the `http`, `https`, `ws`, `wss`, `ftp`, and `gopher` schemes. /// - /// The error handler can choose to make the error fatal by returning `Err(..)` + /// For URLs in these schemes, this method always returns `Some(_)`. + /// For other schemes, it is the same as `Url::port()`. 
#[inline] - pub fn error_handler<'b>(&'b mut self, value: ErrorHandler) -> &'b mut UrlParser<'a> { - self.error_handler = value; - self + pub fn port_or_known_default(&self) -> Option { + self.port.or_else(|| parser::default_port(self.scheme())) } - /// Set a *scheme type mapper*, and return the `UrlParser`. + /// If the URL has a host, return something that implements `ToSocketAddrs`. /// - /// The URL parser behaves differently based on the `SchemeType` of the URL. - /// See the documentation for `SchemeType` for more details. - /// A *scheme type mapper* returns a `SchemeType` - /// based on the scheme as an ASCII lower case string, - /// as found in the `scheme` field of an `Url` struct. + /// If the URL has no port number and the scheme’s default port number is not known + /// (see `Url::port_or_known_default`), + /// the closure is called to obtain a port number. + /// Typically, this closure can match on the result `Url::scheme` + /// to have per-scheme default port numbers, + /// and panic for schemes it’s not prepared to handle. 
+ /// For example: /// - /// The default scheme type mapper is as follows: + /// ```rust + /// # use url::Url; + /// # use std::net::TcpStream; + /// # use std::io; /// - /// ``` - /// # use url::SchemeType; - /// fn whatwg_scheme_type_mapper(scheme: &str) -> SchemeType { - /// match scheme { - /// "file" => SchemeType::FileLike, - /// "ftp" => SchemeType::Relative(21), - /// "gopher" => SchemeType::Relative(70), - /// "http" => SchemeType::Relative(80), - /// "https" => SchemeType::Relative(443), - /// "ws" => SchemeType::Relative(80), - /// "wss" => SchemeType::Relative(443), - /// _ => SchemeType::NonRelative, + /// fn connect(url: &Url) -> io::Result { + /// TcpStream::connect(try!(url.with_default_port(default_port))) + /// } + /// + /// fn default_port(url: &Url) -> Result { + /// match url.scheme() { + /// "git" => Ok(9418), + /// "git+ssh" => Ok(22), + /// "git+https" => Ok(443), + /// "git+http" => Ok(80), + /// _ => Err(()), /// } /// } /// ``` + pub fn with_default_port(&self, f: F) -> io::Result> + where F: FnOnce(&Url) -> Result { + Ok(HostAndPort { + host: try!(self.host() + .ok_or(()) + .or_else(|()| io_error("URL has no host"))), + port: try!(self.port_or_known_default() + .ok_or(()) + .or_else(|()| f(self)) + .or_else(|()| io_error("URL has no port number"))) + }) + } + + /// Return the path for this URL, as a percent-encoded ASCII string. + /// For cannot-be-a-base URLs, this is an arbitrary string that doesn’t start with '/'. + /// For other URLs, this starts with a '/' slash + /// and continues with slash-separated path segments. 
+ pub fn path(&self) -> &str { + match (self.query_start, self.fragment_start) { + (None, None) => self.slice(self.path_start..), + (Some(next_component_start), _) | + (None, Some(next_component_start)) => { + self.slice(self.path_start..next_component_start) + } + } + } + + /// Unless this URL is cannot-be-a-base, + /// return an iterator of '/' slash-separated path segments, + /// each as a percent-encoded ASCII string. /// - /// Note that unknown schemes default to non-relative. - /// Overriding the scheme type mapper can allow, for example, - /// parsing URLs in the `git` or `irc` scheme as relative. - #[inline] - pub fn scheme_type_mapper<'b>(&'b mut self, value: fn(scheme: &str) -> SchemeType) - -> &'b mut UrlParser<'a> { - self.scheme_type_mapper = value; - self + /// Return `None` for cannot-be-a-base URLs, or an iterator of at least one string. + pub fn path_segments(&self) -> Option> { + let path = self.path(); + if path.starts_with('/') { + Some(path[1..].split('/')) + } else { + None + } + } + + /// Return this URL’s query string, if any, as a percent-encoded ASCII string. + pub fn query(&self) -> Option<&str> { + match (self.query_start, self.fragment_start) { + (None, _) => None, + (Some(query_start), None) => { + debug_assert!(self.byte_at(query_start) == b'?'); + Some(self.slice(query_start + 1..)) + } + (Some(query_start), Some(fragment_start)) => { + debug_assert!(self.byte_at(query_start) == b'?'); + Some(self.slice(query_start + 1..fragment_start)) + } + } } - /// Parse `input` as an URL, with all the parameters previously set in the `UrlParser`. + /// Parse the URL’s query string, if any, as `application/x-www-form-urlencoded` + /// and return an iterator of (key, value) pairs. 
#[inline] - pub fn parse(&self, input: &str) -> ParseResult { - parser::parse_url(input, self) + pub fn query_pairs(&self) -> form_urlencoded::Parse { + form_urlencoded::parse(self.query().unwrap_or("").as_bytes()) } - /// Parse `input` as a “standalone” URL path, - /// with an optional query string and fragment identifier. - /// - /// This is typically found in the start line of an HTTP header. - /// - /// Note that while the start line has no fragment identifier in the HTTP RFC, - /// servers typically parse it and ignore it - /// (rather than having it be part of the path or query string.) + /// Return this URL’s fragment identifier, if any. /// - /// On success, return `(path, query_string, fragment_identifier)` - #[inline] - pub fn parse_path(&self, input: &str) - -> ParseResult<(Vec, Option, Option)> { - parser::parse_standalone_path(input, self) + /// **Note:** the parser did *not* percent-encode this component, + /// but the input may have been percent-encoded already. + pub fn fragment(&self) -> Option<&str> { + self.fragment_start.map(|start| { + debug_assert!(self.byte_at(start) == b'#'); + self.slice(start + 1..) + }) } -} + fn mutate R, R>(&mut self, f: F) -> R { + let mut parser = Parser::for_setter(mem::replace(&mut self.serialization, String::new())); + let result = f(&mut parser); + self.serialization = parser.serialization; + result + } -/// Parse `input` as a “standalone” URL path, -/// with an optional query string and fragment identifier. -/// -/// This is typically found in the start line of an HTTP header. -/// -/// Note that while the start line has no fragment identifier in the HTTP RFC, -/// servers typically parse it and ignore it -/// (rather than having it be part of the path or query string.) 
-/// -/// On success, return `(path, query_string, fragment_identifier)` -/// -/// ```rust -/// let (path, query, fragment) = url::parse_path("/foo/bar/../baz?q=42").unwrap(); -/// assert_eq!(path, vec!["foo".to_string(), "baz".to_string()]); -/// assert_eq!(query, Some("q=42".to_string())); -/// assert_eq!(fragment, None); -/// ``` -/// -/// The query string returned by `url::parse_path` can be decoded with -/// `url::form_urlencoded::parse`. -#[inline] -pub fn parse_path(input: &str) - -> ParseResult<(Vec, Option, Option)> { - UrlParser::new().parse_path(input) -} + /// Change this URL’s fragment identifier. + pub fn set_fragment(&mut self, fragment: Option<&str>) { + // Remove any previous fragment + if let Some(start) = self.fragment_start { + debug_assert!(self.byte_at(start) == b'#'); + self.serialization.truncate(start as usize); + } + // Write the new one + if let Some(input) = fragment { + self.fragment_start = Some(to_u32(self.serialization.len()).unwrap()); + self.serialization.push('#'); + self.mutate(|parser| parser.parse_fragment(input)) + } else { + self.fragment_start = None + } + } + fn take_fragment(&mut self) -> Option { + self.fragment_start.take().map(|start| { + debug_assert!(self.byte_at(start) == b'#'); + let fragment = self.slice(start + 1..).to_owned(); + self.serialization.truncate(start as usize); + fragment + }) + } -/// Private convenience methods for use in parser.rs -impl<'a> UrlParser<'a> { - #[inline] - fn parse_error(&self, error: ParseError) -> ParseResult<()> { - (self.error_handler)(error) + fn restore_already_parsed_fragment(&mut self, fragment: Option) { + if let Some(ref fragment) = fragment { + assert!(self.fragment_start.is_none()); + self.fragment_start = Some(to_u32(self.serialization.len()).unwrap()); + self.serialization.push('#'); + self.serialization.push_str(fragment); + } } - #[inline] - fn get_scheme_type(&self, scheme: &str) -> SchemeType { - (self.scheme_type_mapper)(scheme) + /// Change this URL’s query string. 
+ pub fn set_query(&mut self, query: Option<&str>) { + let fragment = self.take_fragment(); + + // Remove any previous query + if let Some(start) = self.query_start.take() { + debug_assert!(self.byte_at(start) == b'?'); + self.serialization.truncate(start as usize); + } + // Write the new query, if any + if let Some(input) = query { + self.query_start = Some(to_u32(self.serialization.len()).unwrap()); + self.serialization.push('?'); + let scheme_end = self.scheme_end; + self.mutate(|parser| parser.parse_query(scheme_end, input)); + } + + self.restore_already_parsed_fragment(fragment); } -} + /// Manipulate this URL’s query string, viewed as a sequence of name/value pairs + /// in `application/x-www-form-urlencoded` syntax. + /// + /// The return value has a method-chaining API: + /// + /// ```rust + /// # use url::Url; + /// let mut url = Url::parse("https://example.net?lang=fr#nav").unwrap(); + /// assert_eq!(url.query(), Some("lang=fr")); + /// + /// url.mutate_query_pairs().append_pair("foo", "bar"); + /// assert_eq!(url.query(), Some("lang=fr&foo=bar")); + /// assert_eq!(url.as_str(), "https://example.net/?lang=fr&foo=bar#nav"); + /// + /// url.mutate_query_pairs() + /// .clear() + /// .append_pair("foo", "bar & baz") + /// .append_pair("saisons", "Été+hiver"); + /// assert_eq!(url.query(), Some("foo=bar+%26+baz&saisons=%C3%89t%C3%A9%2Bhiver")); + /// assert_eq!(url.as_str(), + /// "https://example.net/?foo=bar+%26+baz&saisons=%C3%89t%C3%A9%2Bhiver#nav"); + /// ``` + /// + /// Note: `url.mutate_query_pairs().clear();` is equivalent to `url.set_query(Some(""))`, + /// not `url.set_query(None)`. + /// + /// The state of `Url` is unspecified if this return value is leaked without being dropped. 
+ pub fn mutate_query_pairs(&mut self) -> form_urlencoded::Serializer { + let fragment = self.take_fragment(); + + let query_start; + if let Some(start) = self.query_start { + debug_assert!(self.byte_at(start) == b'?'); + query_start = start as usize; + } else { + query_start = self.serialization.len(); + self.query_start = Some(to_u32(query_start).unwrap()); + self.serialization.push('?'); + } + + let query = UrlQuery { url: self, fragment: fragment }; + form_urlencoded::Serializer::for_suffix(query, query_start + "?".len()) + } + + /// Change this URL’s path. + pub fn set_path(&mut self, path: &str) { + let (old_after_path_pos, after_path) = match (self.query_start, self.fragment_start) { + (Some(i), _) | (None, Some(i)) => (i, self.slice(i..).to_owned()), + (None, None) => (to_u32(self.serialization.len()).unwrap(), String::new()) + }; + let cannot_be_a_base = self.cannot_be_a_base(); + let scheme_type = SchemeType::from(self.scheme()); + self.serialization.truncate(self.path_start as usize); + self.mutate(|parser| { + if cannot_be_a_base { + if path.starts_with('/') { + parser.serialization.push_str("%2F"); + parser.parse_cannot_be_a_base_path(&path[1..]); + } else { + parser.parse_cannot_be_a_base_path(path); + } + } else { + let mut has_host = true; // FIXME + parser.parse_path_start(scheme_type, &mut has_host, path); + } + }); + let new_after_path_pos = to_u32(self.serialization.len()).unwrap(); + let adjust = |index: &mut u32| { + *index -= old_after_path_pos; + *index += new_after_path_pos; + }; + if let Some(ref mut index) = self.query_start { adjust(index) } + if let Some(ref mut index) = self.fragment_start { adjust(index) } + self.serialization.push_str(&after_path) + } + + /// Remove the last segment of this URL’s path. + /// + /// If this URL is cannot-be-a-base, do nothing and return `Err`. + /// If this URL is not cannot-be-a-base and its path is `/`, do nothing and return `Ok`. 
+ pub fn pop_path_segment(&mut self) -> Result<(), ()> { + if self.cannot_be_a_base() { + return Err(()) + } + let last_slash; + let path_len; + { + let path = self.path(); + last_slash = path.rfind('/').unwrap(); + path_len = path.len(); + }; + if last_slash > 0 { + // Found a slash other than the initial one + let last_slash = last_slash + self.path_start as usize; + let path_end = path_len + self.path_start as usize; + self.serialization.drain(last_slash..path_end); + let offset = (path_end - last_slash) as u32; + if let Some(ref mut index) = self.query_start { *index -= offset } + if let Some(ref mut index) = self.fragment_start { *index -= offset } + } + Ok(()) + } -/// Determines the behavior of the URL parser for a given scheme. -#[derive(PartialEq, Eq, Copy, Debug, Clone, Hash, PartialOrd, Ord)] -pub enum SchemeType { - /// Indicate that the scheme is *non-relative*. + /// Add a segment at the end of this URL’s path. /// - /// The *scheme data* of the URL - /// (everything other than the scheme, query string, and fragment identifier) - /// is parsed as a single percent-encoded string of which no structure is assumed. - /// That string may need to be parsed further, per a scheme-specific format. - NonRelative, + /// If this URL is cannot-be-a-base, do nothing and return `Err`. + pub fn push_path_segment(&mut self, segment: &str) -> Result<(), ()> { + if self.cannot_be_a_base() { + return Err(()) + } + let after_path = match (self.query_start, self.fragment_start) { + (Some(i), _) | (None, Some(i)) => { + let s = self.slice(i..).to_owned(); + self.serialization.truncate(i as usize); + s + }, + (None, None) => String::new() + }; + let scheme_type = SchemeType::from(self.scheme()); + let path_start = self.path_start as usize; + self.serialization.push('/'); + self.mutate(|parser| { + parser.context = parser::Context::PathSegmentSetter; + let mut has_host = true; // FIXME account for this? 
+ parser.parse_path(scheme_type, &mut has_host, path_start, segment) + }); + let offset = to_u32(self.serialization.len()).unwrap() - self.path_start; + if let Some(ref mut index) = self.query_start { *index += offset } + if let Some(ref mut index) = self.fragment_start { *index += offset } + self.serialization.push_str(&after_path); + Ok(()) + } - /// Indicate that the scheme is *relative*, and what the default port number is. + /// Change this URL’s port number. /// - /// The *scheme data* is structured as - /// *username*, *password*, *host*, *port number*, and *path*. - /// Relative URL references are supported, if a base URL was given. - /// The string value indicates the default port number as a string of ASCII digits, - /// or the empty string to indicate no default port number. - Relative(u16), - - /// Indicate a *relative* scheme similar to the *file* scheme. + /// If this URL is cannot-be-a-base, does not have a host, or has the `file` scheme; + /// do nothing and return `Err`. + pub fn set_port(&mut self, mut port: Option) -> Result<(), ()> { + if !self.has_host() || self.scheme() == "file" { + return Err(()) + } + if port.is_some() && port == parser::default_port(self.scheme()) { + port = None + } + self.set_port_internal(port); + Ok(()) + } + + fn set_port_internal(&mut self, port: Option) { + match (self.port, port) { + (None, None) => {} + (Some(_), None) => { + self.serialization.drain(self.host_end as usize .. 
self.path_start as usize); + let offset = self.path_start - self.host_end; + self.path_start = self.host_end; + if let Some(ref mut index) = self.query_start { *index -= offset } + if let Some(ref mut index) = self.fragment_start { *index -= offset } + } + (Some(old), Some(new)) if old == new => {} + (_, Some(new)) => { + let path_and_after = self.slice(self.path_start..).to_owned(); + self.serialization.truncate(self.host_end as usize); + write!(&mut self.serialization, ":{}", new).unwrap(); + let old_path_start = self.path_start; + let new_path_start = to_u32(self.serialization.len()).unwrap(); + self.path_start = new_path_start; + let adjust = |index: &mut u32| { + *index -= old_path_start; + *index += new_path_start; + }; + if let Some(ref mut index) = self.query_start { adjust(index) } + if let Some(ref mut index) = self.fragment_start { adjust(index) } + self.serialization.push_str(&path_and_after); + } + } + self.port = port; + } + + /// Change this URL’s host. /// - /// For example, you might want to have distinct `git+file` and `hg+file` URL schemes. + /// If this URL is cannot-be-a-base or there is an error parsing the given `host`, + /// do nothing and return `Err`. /// - /// This is like `Relative` except the host can be empty, there is no port number, - /// and path parsing has (platform-independent) quirks to support Windows filenames. - FileLike, -} + /// Removing the host (calling this with `None`) + /// will also remove any username, password, and port number. 
+ pub fn set_host(&mut self, host: Option<&str>) -> Result<(), ParseError> { + if self.cannot_be_a_base() { + return Err(ParseError::SetHostOnCannotBeABaseUrl) + } -impl SchemeType { - pub fn default_port(&self) -> Option { - match *self { - SchemeType::Relative(default_port) => Some(default_port), - _ => None, + if let Some(host) = host { + self.set_host_internal(try!(Host::parse(host)), None) + } else if self.has_host() { + debug_assert!(self.byte_at(self.scheme_end) == b':'); + debug_assert!(self.byte_at(self.path_start) == b'/'); + let new_path_start = self.scheme_end + 1; + self.serialization.drain(self.path_start as usize..new_path_start as usize); + let offset = self.path_start - new_path_start; + self.path_start = new_path_start; + self.username_end = new_path_start; + self.host_start = new_path_start; + self.host_end = new_path_start; + self.port = None; + if let Some(ref mut index) = self.query_start { *index -= offset } + if let Some(ref mut index) = self.fragment_start { *index -= offset } + } + Ok(()) + } + + /// opt_new_port: None means leave unchanged, Some(None) means remove any port number. 
+ fn set_host_internal(&mut self, host: Host, opt_new_port: Option>) { + let old_suffix_pos = if opt_new_port.is_some() { self.path_start } else { self.host_end }; + let suffix = self.slice(old_suffix_pos..).to_owned(); + self.serialization.truncate(self.host_start as usize); + if !self.has_host() { + debug_assert!(self.slice(self.scheme_end..self.host_start) == ":"); + debug_assert!(self.username_end == self.host_start); + self.serialization.push('/'); + self.serialization.push('/'); + self.username_end += 2; + self.host_start += 2; + } + write!(&mut self.serialization, "{}", host).unwrap(); + self.host_end = to_u32(self.serialization.len()).unwrap(); + self.host = host.into(); + + if let Some(new_port) = opt_new_port { + self.port = new_port; + if let Some(port) = new_port { + write!(&mut self.serialization, ":{}", port).unwrap(); + } } + let new_suffix_pos = to_u32(self.serialization.len()).unwrap(); + self.serialization.push_str(&suffix); + + let adjust = |index: &mut u32| { + *index -= old_suffix_pos; + *index += new_suffix_pos; + }; + adjust(&mut self.path_start); + if let Some(ref mut index) = self.query_start { adjust(index) } + if let Some(ref mut index) = self.fragment_start { adjust(index) } } - pub fn same_as(&self, other: SchemeType) -> bool { - match (self, other) { - (&SchemeType::NonRelative, SchemeType::NonRelative) => true, - (&SchemeType::Relative(_), SchemeType::Relative(_)) => true, - (&SchemeType::FileLike, SchemeType::FileLike) => true, - _ => false + + /// Change this URL’s host to the given IP address. + /// + /// If this URL is cannot-be-a-base, do nothing and return `Err`. + /// + /// Compared to `Url::set_host`, this skips the host parser. 
+ pub fn set_ip_host(&mut self, address: IpAddr) -> Result<(), ()> { + if self.cannot_be_a_base() { + return Err(()) } + + let address = match address { + IpAddr::V4(address) => Host::Ipv4(address), + IpAddr::V6(address) => Host::Ipv6(address), + }; + self.set_host_internal(address, None); + Ok(()) } -} -/// http://url.spec.whatwg.org/#special-scheme -pub fn whatwg_scheme_type_mapper(scheme: &str) -> SchemeType { - match scheme { - "file" => SchemeType::FileLike, - "ftp" => SchemeType::Relative(21), - "gopher" => SchemeType::Relative(70), - "http" => SchemeType::Relative(80), - "https" => SchemeType::Relative(443), - "ws" => SchemeType::Relative(80), - "wss" => SchemeType::Relative(443), - _ => SchemeType::NonRelative, + /// Change this URL’s password. + /// + /// If this URL is cannot-be-a-base or does not have a host, do nothing and return `Err`. + pub fn set_password(&mut self, password: Option<&str>) -> Result<(), ()> { + if !self.has_host() { + return Err(()) + } + if let Some(password) = password { + let host_and_after = self.slice(self.host_start..).to_owned(); + self.serialization.truncate(self.username_end as usize); + self.serialization.push(':'); + self.serialization.extend(utf8_percent_encode(password, USERINFO_ENCODE_SET)); + self.serialization.push('@'); + + let old_host_start = self.host_start; + let new_host_start = to_u32(self.serialization.len()).unwrap(); + let adjust = |index: &mut u32| { + *index -= old_host_start; + *index += new_host_start; + }; + self.host_start = new_host_start; + adjust(&mut self.host_end); + adjust(&mut self.path_start); + if let Some(ref mut index) = self.query_start { adjust(index) } + if let Some(ref mut index) = self.fragment_start { adjust(index) } + + self.serialization.push_str(&host_and_after); + } else if self.byte_at(self.username_end) == b':' { // If there is a password to remove + let has_username_or_password = self.byte_at(self.host_start - 1) == b'@'; + debug_assert!(has_username_or_password); + let 
username_start = self.scheme_end + 3; + let empty_username = username_start == self.username_end; + let start = self.username_end; // Remove the ':' + let end = if empty_username { + self.host_start // Remove the '@' as well + } else { + self.host_start - 1 // Keep the '@' to separate the username from the host + }; + self.serialization.drain(start as usize .. end as usize); + let offset = end - start; + self.host_start -= offset; + self.host_end -= offset; + self.path_start -= offset; + if let Some(ref mut index) = self.query_start { *index -= offset } + if let Some(ref mut index) = self.fragment_start { *index -= offset } + } + Ok(()) } -} + /// Change this URL’s username. + /// + /// If this URL is cannot-be-a-base or does not have a host, do nothing and return `Err`. + pub fn set_username(&mut self, username: &str) -> Result<(), ()> { + if !self.has_host() { + return Err(()) + } + let username_start = self.scheme_end + 3; + debug_assert!(self.slice(self.scheme_end..username_start) == "://"); + if self.slice(username_start..self.username_end) == username { + return Ok(()) + } + let after_username = self.slice(self.username_end..).to_owned(); + self.serialization.truncate(username_start as usize); + self.serialization.extend(utf8_percent_encode(username, USERINFO_ENCODE_SET)); + + let mut removed_bytes = self.username_end; + self.username_end = to_u32(self.serialization.len()).unwrap(); + let mut added_bytes = self.username_end; + + let new_username_is_empty = self.username_end == username_start; + match (new_username_is_empty, after_username.chars().next()) { + (true, Some('@')) => { + removed_bytes += 1; + self.serialization.push_str(&after_username[1..]); + } + (false, Some('@')) | (_, Some(':')) | (true, _) => { + self.serialization.push_str(&after_username); + } + (false, _) => { + added_bytes += 1; + self.serialization.push('@'); + self.serialization.push_str(&after_username); + } + } -impl Url { - /// Parse an URL with the default `UrlParser` parameters. 
+ let adjust = |index: &mut u32| { + *index -= removed_bytes; + *index += added_bytes; + }; + adjust(&mut self.host_start); + adjust(&mut self.host_end); + adjust(&mut self.path_start); + if let Some(ref mut index) = self.query_start { adjust(index) } + if let Some(ref mut index) = self.fragment_start { adjust(index) } + Ok(()) + } + + /// Change this URL’s scheme. /// - /// In particular, relative URL references are parse errors since no base URL is provided. - #[inline] - pub fn parse(input: &str) -> ParseResult { - UrlParser::new().parse(input) + /// Do nothing and return `Err` if: + /// * The new scheme is not in `[a-zA-Z][a-zA-Z0-9+.-]+` + /// * This URL is cannot-be-a-base and the new scheme is one of + /// `http`, `https`, `ws`, `wss`, `ftp`, or `gopher` + pub fn set_scheme(&mut self, scheme: &str) -> Result<(), ()> { + let mut parser = Parser::for_setter(String::new()); + let remaining = try!(parser.parse_scheme(scheme)); + if !remaining.is_empty() || + (!self.has_host() && SchemeType::from(&parser.serialization).is_special()) { + return Err(()) + } + let old_scheme_end = self.scheme_end; + let new_scheme_end = to_u32(parser.serialization.len()).unwrap(); + let adjust = |index: &mut u32| { + *index -= old_scheme_end; + *index += new_scheme_end; + }; + + self.scheme_end = new_scheme_end; + adjust(&mut self.username_end); + adjust(&mut self.host_start); + adjust(&mut self.host_end); + adjust(&mut self.path_start); + if let Some(ref mut index) = self.query_start { adjust(index) } + if let Some(ref mut index) = self.fragment_start { adjust(index) } + + parser.serialization.push_str(self.slice(old_scheme_end..)); + self.serialization = parser.serialization; + Ok(()) } /// Convert a file name as `std::path::Path` into an URL in the `file` scheme. /// - /// This returns `Err` if the given path is not absolute - /// or, with a Windows path, if the prefix is not a disk prefix (e.g. `C:`). 
+ /// This returns `Err` if the given path is not absolute or, + /// on Windows, if the prefix is not a disk prefix (e.g. `C:`). pub fn from_file_path>(path: P) -> Result { - let path = try!(path_to_file_url_path(path.as_ref())); - Ok(Url::from_path_common(path)) + let mut serialization = "file://".to_owned(); + let path_start = serialization.len() as u32; + try!(path_to_file_url_segments(path.as_ref(), &mut serialization)); + Ok(Url { + serialization: serialization, + scheme_end: "file".len() as u32, + username_end: path_start, + host_start: path_start, + host_end: path_start, + host: HostInternal::None, + port: None, + path_start: path_start, + query_start: None, + fragment_start: None, + }) } /// Convert a directory name as `std::path::Path` into an URL in the `file` scheme. /// - /// This returns `Err` if the given path is not absolute - /// or, with a Windows path, if the prefix is not a disk prefix (e.g. `C:`). + /// This returns `Err` if the given path is not absolute or, + /// on Windows, if the prefix is not a disk prefix (e.g. `C:`). /// - /// Compared to `from_file_path`, this adds an empty component to the path - /// (or, in terms of URL syntax, adds a trailing slash) + /// Compared to `from_file_path`, this ensure that URL’s the path has a trailing slash /// so that the entire path is considered when using this URL as a base URL. /// /// For example: /// /// * `"index.html"` parsed with `Url::from_directory_path(Path::new("/var/www"))` /// as the base URL is `file:///var/www/index.html` - /// * `"index.html"` parsed with `Url::from_file_path(Path::new("/var/www/"))` + /// * `"index.html"` parsed with `Url::from_file_path(Path::new("/var/www"))` /// as the base URL is `file:///var/index.html`, which might not be what was intended. /// - /// (Note that `Path::new` removes any trailing slash.) + /// Note that `std::path` does not consider trailing slashes significant + /// and usually does not include them (e.g. in `Path::parent()`). 
pub fn from_directory_path>(path: P) -> Result { - let mut path = try!(path_to_file_url_path(path.as_ref())); - // Add an empty path component (i.e. a trailing slash in serialization) - // so that the entire path is used as a base URL. - path.push("".to_owned()); - Ok(Url::from_path_common(path)) - } - - fn from_path_common(path: Vec) -> Url { - Url { - scheme: "file".to_owned(), - scheme_data: SchemeData::Relative(RelativeSchemeData { - username: "".to_owned(), - password: None, - port: None, - default_port: None, - host: Host::Domain("".to_owned()), - path: path, - }), - query: None, - fragment: None, + let mut url = try!(Url::from_file_path(path)); + if !url.serialization.ends_with('/') { + url.serialization.push('/') } + Ok(url) } /// Assuming the URL is in the `file` scheme or similar, @@ -634,255 +1092,140 @@ impl Url { /// for a Windows path, is not UTF-8.) #[inline] pub fn to_file_path(&self) -> Result { - match self.scheme_data { - SchemeData::Relative(ref scheme_data) => scheme_data.to_file_path(), - SchemeData::NonRelative(..) => Err(()), - } - } - - /// Return the serialization of this URL as a string. 
- pub fn serialize(&self) -> String { - self.to_string() - } - - /// Return the origin of this URL (https://url.spec.whatwg.org/#origin) - pub fn origin(&self) -> Origin { - match &*self.scheme { - "blob" => { - let result = Url::parse(self.non_relative_scheme_data().unwrap()); - match result { - Ok(ref url) => url.origin(), - Err(_) => Origin::UID(OpaqueOrigin::new()) - } - }, - "ftp" | "gopher" | "http" | "https" | "ws" | "wss" => { - Origin::Tuple(self.scheme.clone(), self.host().unwrap().clone(), - self.port_or_default().unwrap()) - }, - // TODO: Figure out what to do if the scheme is a file - "file" => Origin::UID(OpaqueOrigin::new()), - _ => Origin::UID(OpaqueOrigin::new()) - } - } - - /// Return the serialization of this URL, without the fragment identifier, as a string - pub fn serialize_no_fragment(&self) -> String { - UrlNoFragmentFormatter{ url: self }.to_string() - } - - /// If the URL is *non-relative*, return the string scheme data. - #[inline] - pub fn non_relative_scheme_data(&self) -> Option<&str> { - match self.scheme_data { - SchemeData::Relative(..) => None, - SchemeData::NonRelative(ref scheme_data) => Some(scheme_data), - } - } - - /// If the URL is *non-relative*, return a mutable reference to the string scheme data. - #[inline] - pub fn non_relative_scheme_data_mut(&mut self) -> Option<&mut String> { - match self.scheme_data { - SchemeData::Relative(..) => None, - SchemeData::NonRelative(ref mut scheme_data) => Some(scheme_data), - } - } - - /// If the URL is in a *relative scheme*, return the structured scheme data. - #[inline] - pub fn relative_scheme_data(&self) -> Option<&RelativeSchemeData> { - match self.scheme_data { - SchemeData::Relative(ref scheme_data) => Some(scheme_data), - SchemeData::NonRelative(..) => None, - } - } - - /// If the URL is in a *relative scheme*, - /// return a mutable reference to the structured scheme data. 
- #[inline] - pub fn relative_scheme_data_mut(&mut self) -> Option<&mut RelativeSchemeData> { - match self.scheme_data { - SchemeData::Relative(ref mut scheme_data) => Some(scheme_data), - SchemeData::NonRelative(..) => None, + // FIXME: Figure out what to do w.r.t host. + if matches!(self.host(), None | Some(Host::Domain("localhost"))) { + if let Some(segments) = self.path_segments() { + return file_url_segments_to_pathbuf(segments) + } } + Err(()) } - /// If the URL is in a *relative scheme*, return its username. - #[inline] - pub fn username(&self) -> Option<&str> { - self.relative_scheme_data().map(|scheme_data| &*scheme_data.username) - } - - /// If the URL is in a *relative scheme*, return a mutable reference to its username. - #[inline] - pub fn username_mut(&mut self) -> Option<&mut String> { - self.relative_scheme_data_mut().map(|scheme_data| &mut scheme_data.username) - } - - /// Percent-decode the URL’s username, if any. - /// - /// This is “lossy”: invalid UTF-8 percent-encoded byte sequences - /// will be replaced � U+FFFD, the replacement character. - #[inline] - pub fn lossy_percent_decode_username(&self) -> Option { - self.relative_scheme_data().map(|scheme_data| scheme_data.lossy_percent_decode_username()) - } - - /// If the URL is in a *relative scheme*, return its password, if any. - #[inline] - pub fn password(&self) -> Option<&str> { - self.relative_scheme_data().and_then(|scheme_data| - scheme_data.password.as_ref().map(|password| password as &str)) - } - - /// If the URL is in a *relative scheme*, return a mutable reference to its password, if any. - #[inline] - pub fn password_mut(&mut self) -> Option<&mut String> { - self.relative_scheme_data_mut().and_then(|scheme_data| scheme_data.password.as_mut()) - } + // Private helper methods: - /// Percent-decode the URL’s password, if any. - /// - /// This is “lossy”: invalid UTF-8 percent-encoded byte sequences - /// will be replaced � U+FFFD, the replacement character. 
#[inline] - pub fn lossy_percent_decode_password(&self) -> Option { - self.relative_scheme_data().and_then(|scheme_data| - scheme_data.lossy_percent_decode_password()) + fn slice(&self, range: R) -> &str where R: RangeArg { + range.slice_of(&self.serialization) } - /// Serialize the URL's username and password, if any. - /// - /// Format: ":@" #[inline] - pub fn serialize_userinfo(&mut self) -> Option { - self.relative_scheme_data().map(|scheme_data| scheme_data.serialize_userinfo()) + fn byte_at(&self, i: u32) -> u8 { + self.serialization.as_bytes()[i as usize] } +} - /// If the URL is in a *relative scheme*, return its structured host. - #[inline] - pub fn host(&self) -> Option<&Host> { - self.relative_scheme_data().map(|scheme_data| &scheme_data.host) - } +/// Return an error if `Url::host` or `Url::port_or_known_default` return `None`. +impl ToSocketAddrs for Url { + type Iter = SocketAddrs; - /// If the URL is in a *relative scheme*, return a mutable reference to its structured host. - #[inline] - pub fn host_mut(&mut self) -> Option<&mut Host> { - self.relative_scheme_data_mut().map(|scheme_data| &mut scheme_data.host) + fn to_socket_addrs(&self) -> io::Result { + try!(self.with_default_port(|_| Err(()))).to_socket_addrs() } +} - /// If the URL is in a *relative scheme* and its host is a domain, - /// return the domain as a string. - #[inline] - pub fn domain(&self) -> Option<&str> { - self.relative_scheme_data().and_then(|scheme_data| scheme_data.domain()) - } +/// Parse a string as an URL, without a base URL or encoding override. +impl str::FromStr for Url { + type Err = ParseError; - /// If the URL is in a *relative scheme* and its host is a domain, - /// return a mutable reference to the domain string. 
#[inline] - pub fn domain_mut(&mut self) -> Option<&mut String> { - self.relative_scheme_data_mut().and_then(|scheme_data| scheme_data.domain_mut()) + fn from_str(input: &str) -> Result { + Url::parse(input) } +} - /// If the URL is in a *relative scheme*, serialize its host as a string. - /// - /// A domain a returned as-is, an IPv6 address between [] square brackets. +/// Display the serialization of this URL. +impl fmt::Display for Url { #[inline] - pub fn serialize_host(&self) -> Option { - self.relative_scheme_data().map(|scheme_data| scheme_data.host.serialize()) + fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + fmt::Display::fmt(&self.serialization, formatter) } +} - /// If the URL is in a *relative scheme* and has a port number, return it. +/// Debug the serialization of this URL. +impl fmt::Debug for Url { #[inline] - pub fn port(&self) -> Option { - self.relative_scheme_data().and_then(|scheme_data| scheme_data.port) + fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + fmt::Debug::fmt(&self.serialization, formatter) } +} - /// If the URL is in a *relative scheme*, return a mutable reference to its port. - #[inline] - pub fn port_mut(&mut self) -> Option<&mut Option> { - self.relative_scheme_data_mut().map(|scheme_data| &mut scheme_data.port) - } +/// URLs compare like their serialization. +impl Eq for Url {} - /// If the URL is in a *relative scheme* that is not a file-like, - /// return its port number, even if it is the default. +/// URLs compare like their serialization. +impl PartialEq for Url { #[inline] - pub fn port_or_default(&self) -> Option { - self.relative_scheme_data().and_then(|scheme_data| scheme_data.port_or_default()) + fn eq(&self, other: &Self) -> bool { + self.serialization == other.serialization } +} - /// If the URL is in a *relative scheme*, return its path components. +/// URLs compare like their serialization. 
+impl Ord for Url { #[inline] - pub fn path(&self) -> Option<&[String]> { - self.relative_scheme_data().map(|scheme_data| &*scheme_data.path) + fn cmp(&self, other: &Self) -> cmp::Ordering { + self.serialization.cmp(&other.serialization) } +} - /// If the URL is in a *relative scheme*, return a mutable reference to its path components. +/// URLs compare like their serialization. +impl PartialOrd for Url { #[inline] - pub fn path_mut(&mut self) -> Option<&mut Vec> { - self.relative_scheme_data_mut().map(|scheme_data| &mut scheme_data.path) + fn partial_cmp(&self, other: &Self) -> Option { + self.serialization.partial_cmp(&other.serialization) } +} - /// If the URL is in a *relative scheme*, serialize its path as a string. - /// - /// The returned string starts with a "/" slash, and components are separated by slashes. - /// A trailing slash represents an empty last component. +/// URLs hash like their serialization. +impl hash::Hash for Url { #[inline] - pub fn serialize_path(&self) -> Option { - self.relative_scheme_data().map(|scheme_data| scheme_data.serialize_path()) + fn hash(&self, state: &mut H) where H: hash::Hasher { + hash::Hash::hash(&self.serialization, state) } +} - /// Parse the URL’s query string, if any, as `application/x-www-form-urlencoded` - /// and return a vector of (key, value) pairs. +/// Return the serialization of this URL. +impl AsRef for Url { #[inline] - pub fn query_pairs(&self) -> Option> { - self.query.as_ref().map(|query| form_urlencoded::parse(query.as_bytes())) + fn as_ref(&self) -> &str { + &self.serialization } +} - /// Serialize an iterator of (key, value) pairs as `application/x-www-form-urlencoded` - /// and set it as the URL’s query string. 
- #[inline] - pub fn set_query_from_pairs(&mut self, pairs: I) - where I: IntoIterator, I::Item: Borrow<(K, V)>, K: AsRef, V: AsRef { - self.query = Some(form_urlencoded::serialize(pairs)); - } +trait RangeArg { + fn slice_of<'a>(&self, s: &'a str) -> &'a str; +} - /// Percent-decode the URL’s query string, if any. - /// - /// This is “lossy”: invalid UTF-8 percent-encoded byte sequences - /// will be replaced � U+FFFD, the replacement character. +impl RangeArg for Range { #[inline] - pub fn lossy_percent_decode_query(&self) -> Option { - self.query.as_ref().map(|value| lossy_utf8_percent_decode(value.as_bytes())) + fn slice_of<'a>(&self, s: &'a str) -> &'a str { + &s[self.start as usize .. self.end as usize] } +} - /// Percent-decode the URL’s fragment identifier, if any. - /// - /// This is “lossy”: invalid UTF-8 percent-encoded byte sequences - /// will be replaced � U+FFFD, the replacement character. +impl RangeArg for RangeFrom { #[inline] - pub fn lossy_percent_decode_fragment(&self) -> Option { - self.fragment.as_ref().map(|value| lossy_utf8_percent_decode(value.as_bytes())) + fn slice_of<'a>(&self, s: &'a str) -> &'a str { + &s[self.start as usize ..] } +} - /// Join a path with a base URL. - /// - /// Corresponds to the basic URL parser where `self` is the given base URL. +impl RangeArg for RangeTo { #[inline] - pub fn join(&self, input: &str) -> ParseResult { - UrlParser::new().base_url(self).parse(input) + fn slice_of<'a>(&self, s: &'a str) -> &'a str { + &s[.. 
self.end as usize] } } - +#[cfg(feature="rustc-serialize")] impl rustc_serialize::Encodable for Url { fn encode(&self, encoder: &mut S) -> Result<(), S::Error> { - encoder.emit_str(&self.to_string()) + encoder.emit_str(self.as_str()) } } +#[cfg(feature="rustc-serialize")] impl rustc_serialize::Decodable for Url { fn decode(decoder: &mut D) -> Result { Url::parse(&*try!(decoder.read_str())).map_err(|error| { @@ -893,8 +1236,8 @@ impl rustc_serialize::Decodable for Url { /// Serializes this URL into a `serde` stream. /// -/// This implementation is only available if the `serde_serialization` Cargo feature is enabled. -#[cfg(feature="serde_serialization")] +/// This implementation is only available if the `serde` Cargo feature is enabled. +#[cfg(feature="serde")] impl serde::Serialize for Url { fn serialize(&self, serializer: &mut S) -> Result<(), S::Error> where S: serde::Serializer { format!("{}", self).serialize(serializer) @@ -903,179 +1246,38 @@ impl serde::Serialize for Url { /// Deserializes this URL from a `serde` stream. /// -/// This implementation is only available if the `serde_serialization` Cargo feature is enabled. -#[cfg(feature="serde_serialization")] +/// This implementation is only available if the `serde` Cargo feature is enabled. 
+#[cfg(feature="serde")] impl serde::Deserialize for Url { fn deserialize(deserializer: &mut D) -> Result where D: serde::Deserializer { let string_representation: String = try!(serde::Deserialize::deserialize(deserializer)); - Ok(FromStr::from_str(&string_representation[..]).unwrap()) + Ok(Url::parse(&string_representation).unwrap()) } } -impl fmt::Display for Url { - fn fmt(&self, formatter: &mut Formatter) -> fmt::Result { - try!(UrlNoFragmentFormatter{ url: self }.fmt(formatter)); - if let Some(ref fragment) = self.fragment { - try!(formatter.write_str("#")); - try!(formatter.write_str(fragment)); - } - Ok(()) - } -} - - -impl fmt::Display for SchemeData { - fn fmt(&self, formatter: &mut Formatter) -> fmt::Result { - match *self { - SchemeData::Relative(ref scheme_data) => scheme_data.fmt(formatter), - SchemeData::NonRelative(ref scheme_data) => scheme_data.fmt(formatter), - } - } -} - - -impl RelativeSchemeData { - /// Percent-decode the URL’s username. - /// - /// This is “lossy”: invalid UTF-8 percent-encoded byte sequences - /// will be replaced � U+FFFD, the replacement character. - #[inline] - pub fn lossy_percent_decode_username(&self) -> String { - lossy_utf8_percent_decode(self.username.as_bytes()) - } - - /// Percent-decode the URL’s password, if any. - /// - /// This is “lossy”: invalid UTF-8 percent-encoded byte sequences - /// will be replaced � U+FFFD, the replacement character. - #[inline] - pub fn lossy_percent_decode_password(&self) -> Option { - self.password.as_ref().map(|value| lossy_utf8_percent_decode(value.as_bytes())) - } - - /// Assuming the URL is in the `file` scheme or similar, - /// convert its path to an absolute `std::path::Path`. - /// - /// **Note:** This does not actually check the URL’s `scheme`, - /// and may give nonsensical results for other schemes. - /// It is the user’s responsibility to check the URL’s scheme before calling this. 
- /// - /// ``` - /// # use url::Url; - /// # let url = Url::parse("file:///etc/passwd").unwrap(); - /// let path = url.to_file_path(); - /// ``` - /// - /// Returns `Err` if the host is neither empty nor `"localhost"`, - /// or if `Path::new_opt()` returns `None`. - /// (That is, if the percent-decoded path contains a NUL byte or, - /// for a Windows path, is not UTF-8.) - #[inline] - pub fn to_file_path(&self) -> Result { - // FIXME: Figure out what to do w.r.t host. - if !matches!(self.domain(), Some("") | Some("localhost")) { - return Err(()) - } - file_url_path_to_pathbuf(&self.path) - } - - /// If the host is a domain, return the domain as a string. - #[inline] - pub fn domain(&self) -> Option<&str> { - match self.host { - Host::Domain(ref domain) => Some(domain), - _ => None, - } - } - - /// If the host is a domain, return a mutable reference to the domain string. - #[inline] - pub fn domain_mut(&mut self) -> Option<&mut String> { - match self.host { - Host::Domain(ref mut domain) => Some(domain), - _ => None, - } - } - - /// Return the port number of the URL, even if it is the default. - /// Return `None` for file-like URLs. - #[inline] - pub fn port_or_default(&self) -> Option { - self.port.or(self.default_port) - } - - /// Serialize the path as a string. - /// - /// The returned string starts with a "/" slash, and components are separated by slashes. - /// A trailing slash represents an empty last component. - pub fn serialize_path(&self) -> String { - PathFormatter { - path: &self.path - }.to_string() - } - - /// Serialize the userinfo as a string. - /// - /// Format: ":@". - pub fn serialize_userinfo(&self) -> String { - UserInfoFormatter { - username: &self.username, - password: self.password.as_ref().map(|s| s as &str) - }.to_string() - } -} - - -impl fmt::Display for RelativeSchemeData { - fn fmt(&self, formatter: &mut Formatter) -> fmt::Result { - // Write the scheme-trailing double slashes. 
- try!(formatter.write_str("//")); - - // Write the user info. - try!(UserInfoFormatter { - username: &self.username, - password: self.password.as_ref().map(|s| s as &str) - }.fmt(formatter)); - - // Write the host. - try!(self.host.fmt(formatter)); - - // Write the port. - match self.port { - Some(port) => { - try!(write!(formatter, ":{}", port)); - }, - None => {} - } - - // Write the path. - PathFormatter { - path: &self.path - }.fmt(formatter) - } -} - - #[cfg(unix)] -fn path_to_file_url_path(path: &Path) -> Result, ()> { +fn path_to_file_url_segments(path: &Path, serialization: &mut String) -> Result<(), ()> { use std::os::unix::prelude::OsStrExt; if !path.is_absolute() { return Err(()) } // skip the root component - Ok(path.components().skip(1).map(|c| { - percent_encode(c.as_os_str().as_bytes(), DEFAULT_ENCODE_SET) - }).collect()) + for component in path.components().skip(1) { + serialization.push('/'); + serialization.extend(percent_encode( + component.as_os_str().as_bytes(), PATH_SEGMENT_ENCODE_SET)) + } + Ok(()) } #[cfg(windows)] -fn path_to_file_url_path(path: &Path) -> Result, ()> { - path_to_file_url_path_windows(path) +fn path_to_file_url_segments(path: &Path, serialization: &mut String) -> Result<(), ()> { + path_to_file_url_segments_windows(path, serialization) } // Build this unconditionally to alleviate https://github.com/servo/rust-url/issues/102 #[cfg_attr(not(windows), allow(dead_code))] -fn path_to_file_url_path_windows(path: &Path) -> Result, ()> { +fn path_to_file_url_segments_windows(path: &Path, serialization: &mut String) -> Result<(), ()> { use std::path::{Prefix, Component}; if !path.is_absolute() { return Err(()) @@ -1093,35 +1295,30 @@ fn path_to_file_url_path_windows(path: &Path) -> Result, ()> { }; // Start with the prefix, e.g. 
"C:" - let mut path = vec![format!("{}:", disk as char)]; + serialization.push('/'); + serialization.push(disk as char); + serialization.push(':'); for component in components { if component == Component::RootDir { continue } // FIXME: somehow work with non-unicode? - let part = match component.as_os_str().to_str() { - Some(s) => s, - None => return Err(()), - }; - path.push(percent_encode(part.as_bytes(), DEFAULT_ENCODE_SET)); + let component = try!(component.as_os_str().to_str().ok_or(())); + serialization.push('/'); + serialization.extend(percent_encode(component.as_bytes(), PATH_SEGMENT_ENCODE_SET)); } - Ok(path) + Ok(()) } #[cfg(unix)] -fn file_url_path_to_pathbuf(path: &[String]) -> Result { +fn file_url_segments_to_pathbuf(segments: str::Split) -> Result { use std::ffi::OsStr; use std::os::unix::prelude::OsStrExt; use std::path::PathBuf; - use percent_encoding::percent_decode_to; - - if path.is_empty() { - return Ok(PathBuf::from("/")) - } let mut bytes = Vec::new(); - for path_part in path { + for segment in segments { bytes.push(b'/'); - percent_decode_to(path_part.as_bytes(), &mut bytes); + bytes.extend(percent_decode(segment.as_bytes())); } let os_str = OsStr::from_bytes(&bytes); let path = PathBuf::from(os_str); @@ -1131,29 +1328,24 @@ fn file_url_path_to_pathbuf(path: &[String]) -> Result { } #[cfg(windows)] -fn file_url_path_to_pathbuf(path: &[String]) -> Result { - file_url_path_to_pathbuf_windows(path) +fn file_url_segments_to_pathbuf(segments: str::Split) -> Result { + file_url_segments_to_pathbuf_windows(segments) } // Build this unconditionally to alleviate https://github.com/servo/rust-url/issues/102 #[cfg_attr(not(windows), allow(dead_code))] -fn file_url_path_to_pathbuf_windows(path: &[String]) -> Result { - use percent_encoding::percent_decode; - - if path.is_empty() { +fn file_url_segments_to_pathbuf_windows(mut segments: str::Split) -> Result { + let first = try!(segments.next().ok_or(())); + if first.len() != 2 || 
!first.starts_with(parser::ascii_alpha) + || first.as_bytes()[1] != b':' { return Err(()) } - let prefix = &*path[0]; - if prefix.len() != 2 || !parser::starts_with_ascii_alpha(prefix) - || prefix.as_bytes()[1] != b':' { - return Err(()) - } - let mut string = prefix.to_owned(); - for path_part in &path[1..] { + let mut string = first.to_owned(); + for segment in segments { string.push('\\'); // Currently non-unicode windows paths cannot be represented - match String::from_utf8(percent_decode(path_part.as_bytes())) { + match String::from_utf8(percent_decode(segment.as_bytes()).collect()) { Ok(s) => string.push_str(&s), Err(..) => return Err(()), } @@ -1163,3 +1355,19 @@ fn file_url_path_to_pathbuf_windows(path: &[String]) -> Result { "to_file_path() failed to produce an absolute Path"); Ok(path) } + +fn io_error(reason: &str) -> io::Result { + Err(io::Error::new(io::ErrorKind::InvalidData, reason)) +} + +/// Implementation detail of `Url::mutate_query_pairs`. Typically not used directly. +pub struct UrlQuery<'a> { + url: &'a mut Url, + fragment: Option, +} + +impl<'a> Drop for UrlQuery<'a> { + fn drop(&mut self) { + self.url.restore_already_parsed_fragment(self.fragment.take()) + } +} diff --git a/src/origin.rs b/src/origin.rs new file mode 100644 index 00000000..a78b939f --- /dev/null +++ b/src/origin.rs @@ -0,0 +1,99 @@ +// Copyright 2016 The rust-url developers. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. 
+ +use host::Host; +use idna::domain_to_unicode; +use parser::default_port; +use std::sync::atomic::{AtomicUsize, ATOMIC_USIZE_INIT, Ordering}; +use Url; + +pub fn url_origin(url: &Url) -> Origin { + let scheme = url.scheme(); + match scheme { + "blob" => { + let result = Url::parse(url.path()); + match result { + Ok(ref url) => url_origin(url), + Err(_) => Origin::new_opaque() + } + }, + "ftp" | "gopher" | "http" | "https" | "ws" | "wss" => { + Origin::Tuple(scheme.to_owned(), url.host().unwrap().to_owned(), + url.port_or_known_default().unwrap()) + }, + // TODO: Figure out what to do if the scheme is a file + "file" => Origin::new_opaque(), + _ => Origin::new_opaque() + } +} + +/// The origin of an URL +#[derive(PartialEq, Eq, Clone, Debug)] +#[cfg_attr(feature="heap_size", derive(HeapSizeOf))] +pub enum Origin { + /// A globally unique identifier + Opaque(OpaqueOrigin), + + /// Consists of the URL's scheme, host and port + Tuple(String, Host, u16) +} + + +impl Origin { + /// Creates a new opaque origin that is only equal to itself. + pub fn new_opaque() -> Origin { + static COUNTER: AtomicUsize = ATOMIC_USIZE_INIT; + Origin::Opaque(OpaqueOrigin(COUNTER.fetch_add(1, Ordering::SeqCst))) + } + + /// Return whether this origin is a (scheme, host, port) tuple + /// (as opposed to an opaque origin). 
+ pub fn is_tuple(&self) -> bool { + matches!(*self, Origin::Tuple(..)) + } + + /// https://html.spec.whatwg.org/multipage/#ascii-serialisation-of-an-origin + pub fn ascii_serialization(&self) -> String { + match *self { + Origin::Opaque(_) => "null".to_owned(), + Origin::Tuple(ref scheme, ref host, port) => { + if default_port(scheme) == Some(port) { + format!("{}://{}", scheme, host) + } else { + format!("{}://{}:{}", scheme, host, port) + } + } + } + } + + /// https://html.spec.whatwg.org/multipage/#unicode-serialisation-of-an-origin + pub fn unicode_serialization(&self) -> String { + match *self { + Origin::Opaque(_) => "null".to_owned(), + Origin::Tuple(ref scheme, ref host, port) => { + let host = match *host { + Host::Domain(ref domain) => { + let (domain, _errors) = domain_to_unicode(domain); + Host::Domain(domain) + } + _ => host.clone() + }; + if default_port(scheme) == Some(port) { + format!("{}://{}", scheme, host) + } else { + format!("{}://{}:{}", scheme, host, port) + } + } + } + } +} + +/// Opaque identifier for URLs that have file or other schemes +#[derive(Eq, PartialEq, Clone, Debug)] +#[cfg_attr(feature="heap_size", derive(HeapSizeOf))] +pub struct OpaqueOrigin(usize); diff --git a/src/parser.rs b/src/parser.rs index ae8182dd..39879de5 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2014 Simon Sapin. +// Copyright 2013-2016 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license @@ -7,20 +7,20 @@ // except according to those terms. 
use std::ascii::AsciiExt; -use std::cmp::max; use std::error::Error; -use std::fmt::{self, Formatter}; +use std::fmt::{self, Formatter, Write}; -use super::{UrlParser, Url, SchemeData, RelativeSchemeData, Host, SchemeType}; +use Url; +use encoding::EncodingOverride; +use host::{Host, HostInternal}; use percent_encoding::{ - utf8_percent_encode_to, percent_encode, - SIMPLE_ENCODE_SET, DEFAULT_ENCODE_SET, USERINFO_ENCODE_SET, QUERY_ENCODE_SET + utf8_percent_encode, percent_encode, + SIMPLE_ENCODE_SET, DEFAULT_ENCODE_SET, USERINFO_ENCODE_SET, QUERY_ENCODE_SET, + PATH_SEGMENT_ENCODE_SET }; - pub type ParseResult = Result; - macro_rules! simple_enum_error { ($($name: ident => $description: expr,)+) => { /// Errors that can occur during parsing. @@ -45,30 +45,15 @@ macro_rules! simple_enum_error { simple_enum_error! { EmptyHost => "empty host", - InvalidScheme => "invalid scheme", + IdnaError => "invalid international domain name", InvalidPort => "invalid port number", InvalidIpv4Address => "invalid IPv4 address", InvalidIpv6Address => "invalid IPv6 address", InvalidDomainCharacter => "invalid domain character", - InvalidCharacter => "invalid character", - InvalidBackslash => "invalid backslash", - InvalidPercentEncoded => "invalid percent-encoded sequence", - InvalidAtSymbolInUser => "invalid @-symbol in user", - ExpectedTwoSlashes => "expected two slashes (//)", - ExpectedInitialSlash => "expected the input to start with a slash", - NonUrlCodePoint => "non URL code point", - RelativeUrlWithScheme => "relative URL with scheme", RelativeUrlWithoutBase => "relative URL without a base", - RelativeUrlWithNonRelativeBase => "relative URL with a non-relative base", - NonAsciiDomainsNotSupportedYet => "non-ASCII domains are not supported yet", - CannotSetJavascriptFragment => "cannot set fragment on javascript: URL", - CannotSetPortWithFileLikeScheme => "cannot set port with file-like scheme", - CannotSetUsernameWithNonRelativeScheme => "cannot set username with non-relative 
scheme", - CannotSetPasswordWithNonRelativeScheme => "cannot set password with non-relative scheme", - CannotSetHostPortWithNonRelativeScheme => "cannot set host and port with non-relative scheme", - CannotSetHostWithNonRelativeScheme => "cannot set host with non-relative scheme", - CannotSetPortWithNonRelativeScheme => "cannot set port with non-relative scheme", - CannotSetPathWithNonRelativeScheme => "cannot set path with non-relative scheme", + RelativeUrlWithCannotBeABaseBase => "relative URL with a cannot-be-a-base base", + SetHostOnCannotBeABaseUrl => "a cannot-be-a-base URL doesn’t have a host to set", + Overflow => "URLs more than 4 GB are not supported", } impl fmt::Display for ParseError { @@ -77,589 +62,941 @@ impl fmt::Display for ParseError { } } -/// This is called on non-fatal parse errors. -/// -/// The handler can choose to continue or abort parsing by returning Ok() or Err(), respectively. -/// See the `UrlParser::error_handler` method. -/// -/// FIXME: make this a by-ref closure when that’s supported. -pub type ErrorHandler = fn(reason: ParseError) -> ParseResult<()>; - - -#[derive(PartialEq, Eq)] -pub enum Context { - UrlParser, - Setter, +impl From<::idna::uts46::Errors> for ParseError { + fn from(_: ::idna::uts46::Errors) -> ParseError { ParseError::IdnaError } } - -pub fn parse_url(input: &str, parser: &UrlParser) -> ParseResult { - let input = input.trim_matches(&[' ', '\t', '\n', '\r', '\x0C'][..]); - let (scheme, remaining) = match parse_scheme(input, Context::UrlParser) { - Some((scheme, remaining)) => (scheme, remaining), - // No-scheme state - None => return match parser.base_url { - Some(&Url { ref scheme, scheme_data: SchemeData::Relative(ref base), - ref query, .. 
}) => { - let scheme_type = parser.get_scheme_type(&scheme); - parse_relative_url(input, scheme.clone(), scheme_type, base, query, parser) - }, - Some(_) => Err(ParseError::RelativeUrlWithNonRelativeBase), - None => Err(ParseError::RelativeUrlWithoutBase), - }, - }; - let scheme_type = parser.get_scheme_type(&scheme); - match scheme_type { - SchemeType::FileLike => { - // Relative state? - match parser.base_url { - Some(&Url { scheme: ref base_scheme, scheme_data: SchemeData::Relative(ref base), - ref query, .. }) - if scheme == *base_scheme => { - parse_relative_url(remaining, scheme, scheme_type, base, query, parser) - }, - // FIXME: Should not have to use a made-up base URL. - _ => parse_relative_url(remaining, scheme, scheme_type, &RelativeSchemeData { - username: String::new(), password: None, host: Host::Domain(String::new()), - port: None, default_port: None, path: Vec::new() - }, &None, parser) - } - }, - SchemeType::Relative(..) => { - match parser.base_url { - Some(&Url { scheme: ref base_scheme, scheme_data: SchemeData::Relative(ref base), - ref query, .. 
}) - if scheme == *base_scheme && !remaining.starts_with("//") => { - try!(parser.parse_error(ParseError::RelativeUrlWithScheme)); - parse_relative_url(remaining, scheme, scheme_type, base, query, parser) - }, - _ => parse_absolute_url(scheme, scheme_type, remaining, parser), - } - }, - SchemeType::NonRelative => { - // Scheme data state - let (scheme_data, remaining) = try!(parse_scheme_data(remaining, parser)); - let (query, fragment) = try!(parse_query_and_fragment(remaining, parser)); - Ok(Url { scheme: scheme, scheme_data: SchemeData::NonRelative(scheme_data), - query: query, fragment: fragment }) - } - } +#[derive(Copy, Clone)] +pub enum SchemeType { + File, + SpecialNotFile, + NotSpecial, } +impl SchemeType { + pub fn is_special(&self) -> bool { + !matches!(*self, SchemeType::NotSpecial) + } -pub fn parse_scheme(input: &str, context: Context) -> Option<(String, &str)> { - if input.is_empty() || !starts_with_ascii_alpha(input) { - return None + pub fn is_file(&self) -> bool { + matches!(*self, SchemeType::File) } - for (i, c) in input.char_indices() { - match c { - 'a'...'z' | 'A'...'Z' | '0'...'9' | '+' | '-' | '.' 
=> (), - ':' => return Some(( - input[..i].to_ascii_lowercase(), - &input[i + 1..], - )), - _ => return None, + + pub fn from(s: &str) -> Self { + match s { + "http" | "https" | "ws" | "wss" | "ftp" | "gopher" => SchemeType::SpecialNotFile, + "file" => SchemeType::File, + _ => SchemeType::NotSpecial, } } - // EOF before ':' - match context { - Context::Setter => Some((input.to_ascii_lowercase(), "")), - Context::UrlParser => None - } } +pub fn default_port(scheme: &str) -> Option { + match scheme { + "http" | "ws" => Some(80), + "https" | "wss" => Some(443), + "ftp" => Some(21), + "gopher" => Some(70), + _ => None, + } +} -fn parse_absolute_url<'a>(scheme: String, scheme_type: SchemeType, - input: &'a str, parser: &UrlParser) -> ParseResult { - // Authority first slash state - let remaining = try!(skip_slashes(input, parser)); - // Authority state - let (username, password, remaining) = try!(parse_userinfo(remaining, parser)); - // Host state - let (host, port, default_port, remaining) = try!(parse_host(remaining, scheme_type, parser)); - let (path, remaining) = try!(parse_path_start( - remaining, Context::UrlParser, scheme_type, parser)); - let scheme_data = SchemeData::Relative(RelativeSchemeData { - username: username, password: password, - host: host, port: port, default_port: default_port, - path: path }); - let (query, fragment) = try!(parse_query_and_fragment(remaining, parser)); - Ok(Url { scheme: scheme, scheme_data: scheme_data, query: query, fragment: fragment }) +pub struct Parser<'a> { + pub serialization: String, + pub base_url: Option<&'a Url>, + pub query_encoding_override: EncodingOverride, + pub log_syntax_violation: Option<&'a Fn(&'static str)>, + pub context: Context, } +#[derive(PartialEq, Eq, Copy, Clone)] +pub enum Context { + UrlParser, + Setter, + PathSegmentSetter, +} -fn parse_relative_url<'a>(input: &'a str, scheme: String, scheme_type: SchemeType, - base: &RelativeSchemeData, base_query: &Option, - parser: &UrlParser) - -> ParseResult { 
- let mut chars = input.chars(); - match chars.next() { - Some('/') | Some('\\') => { - let ch = chars.next(); - // Relative slash state - if matches!(ch, Some('/') | Some('\\')) { - if ch == Some('\\') { - try!(parser.parse_error(ParseError::InvalidBackslash)) - } - if scheme_type == SchemeType::FileLike { - // File host state - let remaining = &input[2..]; - let (host, remaining) = if remaining.len() >= 2 - && starts_with_ascii_alpha(remaining) - && matches!(remaining.as_bytes()[1], b':' | b'|') - && (remaining.len() == 2 - || matches!(remaining.as_bytes()[2], - b'/' | b'\\' | b'?' | b'#')) - { - // Windows drive letter quirk - (Host::Domain(String::new()), remaining) - } else { - try!(parse_file_host(remaining, parser)) - }; - let (path, remaining) = try!(parse_path_start( - remaining, Context::UrlParser, scheme_type, parser)); - let scheme_data = SchemeData::Relative(RelativeSchemeData { - username: String::new(), password: None, - host: host, port: None, default_port: None, path: path - }); - let (query, fragment) = try!(parse_query_and_fragment(remaining, parser)); - Ok(Url { scheme: scheme, scheme_data: scheme_data, - query: query, fragment: fragment }) - } else { - parse_absolute_url(scheme, scheme_type, input, parser) - } - } else { - // Relative path state - let (path, remaining) = try!(parse_path( - &[], &input[1..], Context::UrlParser, scheme_type, parser)); - let scheme_data = SchemeData::Relative(if scheme_type == SchemeType::FileLike { - RelativeSchemeData { - username: String::new(), password: None, host: - Host::Domain(String::new()), port: None, default_port: None, path: path - } - } else { - RelativeSchemeData { - username: base.username.clone(), - password: base.password.clone(), - host: base.host.clone(), - port: base.port.clone(), - default_port: base.default_port.clone(), - path: path - } - }); - let (query, fragment) = try!( - parse_query_and_fragment(remaining, parser)); - Ok(Url { scheme: scheme, scheme_data: scheme_data, - query: query, 
fragment: fragment }) - } - }, - Some('?') => { - let (query, fragment) = try!(parse_query_and_fragment(input, parser)); - Ok(Url { scheme: scheme, scheme_data: SchemeData::Relative(base.clone()), - query: query, fragment: fragment }) - }, - Some('#') => { - let fragment = Some(try!(parse_fragment(&input[1..], parser))); - Ok(Url { scheme: scheme, scheme_data: SchemeData::Relative(base.clone()), - query: base_query.clone(), fragment: fragment }) - } - None => { - Ok(Url { scheme: scheme, scheme_data: SchemeData::Relative(base.clone()), - query: base_query.clone(), fragment: None }) - } - _ => { - let (scheme_data, remaining) = if scheme_type == SchemeType::FileLike - && input.len() >= 2 - && starts_with_ascii_alpha(input) - && matches!(input.as_bytes()[1], b':' | b'|') - && (input.len() == 2 - || matches!(input.as_bytes()[2], b'/' | b'\\' | b'?' | b'#')) - { - // Windows drive letter quirk - let (path, remaining) = try!(parse_path( - &[], input, Context::UrlParser, scheme_type, parser)); - (SchemeData::Relative(RelativeSchemeData { - username: String::new(), password: None, - host: Host::Domain(String::new()), - port: None, - default_port: None, - path: path - }), remaining) - } else { - let base_path = &base.path[..max(base.path.len(), 1) - 1]; - // Relative path state - let (path, remaining) = try!(parse_path( - base_path, input, Context::UrlParser, scheme_type, parser)); - (SchemeData::Relative(RelativeSchemeData { - username: base.username.clone(), - password: base.password.clone(), - host: base.host.clone(), - port: base.port.clone(), - default_port: base.default_port.clone(), - path: path - }), remaining) - }; - let (query, fragment) = try!(parse_query_and_fragment(remaining, parser)); - Ok(Url { scheme: scheme, scheme_data: scheme_data, - query: query, fragment: fragment }) +impl<'a> Parser<'a> { + pub fn for_setter(serialization: String) -> Parser<'a> { + Parser { + serialization: serialization, + base_url: None, + query_encoding_override: 
EncodingOverride::utf8(), + log_syntax_violation: None, + context: Context::Setter, } } -} + fn syntax_violation(&self, reason: &'static str) { + if let Some(log) = self.log_syntax_violation { + log(reason) + } + } -fn skip_slashes<'a>(input: &'a str, parser: &UrlParser) -> ParseResult<&'a str> { - let first_non_slash = input.find(|c| !matches!(c, '/' | '\\')).unwrap_or(input.len()); - if &input[..first_non_slash] != "//" { - try!(parser.parse_error(ParseError::ExpectedTwoSlashes)); + fn syntax_violation_if bool>(&self, reason: &'static str, test: F) { + // Skip test if not logging. + if let Some(log) = self.log_syntax_violation { + if test() { + log(reason) + } + } } - Ok(&input[first_non_slash..]) -} + /// https://url.spec.whatwg.org/#concept-basic-url-parser + pub fn parse_url(mut self, original_input: &str) -> ParseResult { + let input = original_input.trim_matches(c0_control_or_space); + if input.len() < original_input.len() { + self.syntax_violation("leading or trailing control or space character") + } + if let Ok(remaining) = self.parse_scheme(input) { + return self.parse_with_scheme(remaining) + } -fn parse_userinfo<'a>(input: &'a str, parser: &UrlParser) - -> ParseResult<(String, Option, &'a str)> { - let mut last_at = None; - for (i, c) in input.char_indices() { - match c { - '@' => { - if last_at.is_some() { - try!(parser.parse_error(ParseError::InvalidAtSymbolInUser)) + // No-scheme state + if let Some(base_url) = self.base_url { + if input.starts_with("#") { + self.fragment_only(base_url, input) + } else if base_url.cannot_be_a_base() { + Err(ParseError::RelativeUrlWithCannotBeABaseBase) + } else { + let scheme_type = SchemeType::from(base_url.scheme()); + if scheme_type.is_file() { + self.parse_file(input, Some(base_url)) + } else { + self.parse_relative(input, scheme_type, base_url) } - last_at = Some(i) - }, - '/' | '\\' | '?' 
| '#' => break, - _ => (), + } + } else { + Err(ParseError::RelativeUrlWithoutBase) } } - let (input, remaining) = match last_at { - Some(at) => (&input[..at], &input[at + 1..]), - None => return Ok((String::new(), None, input)), - }; - let mut username = String::new(); - let mut password = None; - for (i, c, next_i) in input.char_ranges() { - match c { - ':' => { - password = Some(try!(parse_password(&input[i + 1..], parser))); - break - }, - '\t' | '\n' | '\r' => try!(parser.parse_error(ParseError::InvalidCharacter)), - _ => { - try!(check_url_code_point(input, i, c, parser)); - // The spec says to use the default encode set, - // but also replaces '@' by '%40' in an earlier step. - utf8_percent_encode_to(&input[i..next_i], - USERINFO_ENCODE_SET, &mut username); + pub fn parse_scheme<'i>(&mut self, input: &'i str) -> Result<&'i str, ()> { + if input.is_empty() || !input.starts_with(ascii_alpha) { + return Err(()) + } + debug_assert!(self.serialization.is_empty()); + for (i, c) in input.char_indices() { + match c { + 'a'...'z' | 'A'...'Z' | '0'...'9' | '+' | '-' | '.' => { + self.serialization.push(c.to_ascii_lowercase()) + } + ':' => return Ok(&input[i + 1..]), + _ => { + self.serialization.clear(); + return Err(()) + } } } + // EOF before ':' + if self.context == Context::Setter { + Ok("") + } else { + self.serialization.clear(); + Err(()) + } } - Ok((username, password, remaining)) -} - -fn parse_password(input: &str, parser: &UrlParser) -> ParseResult { - let mut password = String::new(); - for (i, c, next_i) in input.char_ranges() { - match c { - '\t' | '\n' | '\r' => try!(parser.parse_error(ParseError::InvalidCharacter)), - _ => { - try!(check_url_code_point(input, i, c, parser)); - // The spec says to use the default encode set, - // but also replaces '@' by '%40' in an earlier step. 
- utf8_percent_encode_to(&input[i..next_i], - USERINFO_ENCODE_SET, &mut password); + fn parse_with_scheme(mut self, input: &str) -> ParseResult { + let scheme_end = try!(to_u32(self.serialization.len())); + let scheme_type = SchemeType::from(&self.serialization); + self.serialization.push(':'); + match scheme_type { + SchemeType::File => { + self.syntax_violation_if("expected // after file:", || !input.starts_with("//")); + let base_file_url = self.base_url.and_then(|base| { + if base.scheme() == "file" { Some(base) } else { None } + }); + self.serialization.clear(); + self.parse_file(input, base_file_url) } + SchemeType::SpecialNotFile => { + // special relative or authority state + let slashes_count = input.find(|c| !matches!(c, '/' | '\\')).unwrap_or(input.len()); + if let Some(base_url) = self.base_url { + if slashes_count < 2 && + base_url.scheme() == &self.serialization[..scheme_end as usize] { + // "Cannot-be-a-base" URLs only happen with "not special" schemes. + debug_assert!(!base_url.cannot_be_a_base()); + self.serialization.clear(); + return self.parse_relative(input, scheme_type, base_url) + } + } + // special authority slashes state + self.syntax_violation_if("expected //", || &input[..slashes_count] != "//"); + self.after_double_slash(&input[slashes_count..], scheme_type, scheme_end) + } + SchemeType::NotSpecial => self.parse_non_special(input, scheme_type, scheme_end) } } - Ok(password) -} - - -pub fn parse_host<'a>(input: &'a str, scheme_type: SchemeType, parser: &UrlParser) - -> ParseResult<(Host, Option, Option, &'a str)> { - let (host, remaining) = try!(parse_hostname(input, parser)); - let (port, default_port, remaining) = if remaining.starts_with(":") { - try!(parse_port(&remaining[1..], scheme_type, parser)) - } else { - (None, scheme_type.default_port(), remaining) - }; - Ok((host, port, default_port, remaining)) -} + /// Scheme other than file, http, https, ws, ws, ftp, gopher. 
+ fn parse_non_special(mut self, input: &str, scheme_type: SchemeType, scheme_end: u32) + -> ParseResult { + // path or authority state ( + if input.starts_with("//") { + return self.after_double_slash(&input[2..], scheme_type, scheme_end) + } + // Anarchist URL (no authority) + let path_start = try!(to_u32(self.serialization.len())); + let username_end = path_start; + let host_start = path_start; + let host_end = path_start; + let host = HostInternal::None; + let port = None; + let remaining = if input.starts_with("/") { + let path_start = self.serialization.len(); + self.serialization.push('/'); + self.parse_path(scheme_type, &mut false, path_start, &input[1..]) + } else { + self.parse_cannot_be_a_base_path(input) + }; + self.with_query_and_fragment(scheme_end, username_end, host_start, + host_end, host, port, path_start, remaining) + } -pub fn parse_hostname<'a>(input: &'a str, parser: &UrlParser) - -> ParseResult<(Host, &'a str)> { - let mut inside_square_brackets = false; - let mut host_input = String::new(); - let mut end = input.len(); - for (i, c) in input.char_indices() { + fn parse_file(mut self, input: &str, mut base_file_url: Option<&Url>) -> ParseResult { + // file state + debug_assert!(self.serialization.is_empty()); + let c = input.chars().next(); match c { - ':' if !inside_square_brackets => { - end = i; - break + None => { + if let Some(base_url) = base_file_url { + // Copy everything except the fragment + let before_fragment = match base_url.fragment_start { + Some(i) => &base_url.serialization[..i as usize], + None => &*base_url.serialization, + }; + self.serialization.push_str(before_fragment); + Ok(Url { + serialization: self.serialization, + fragment_start: None, + ..*base_url + }) + } else { + self.serialization.push_str("file:///"); + let scheme_end = "file".len() as u32; + let path_start = "file://".len() as u32; + Ok(Url { + serialization: self.serialization, + scheme_end: scheme_end, + username_end: path_start, + host_start: path_start, + 
host_end: path_start, + host: HostInternal::None, + port: None, + path_start: path_start, + query_start: None, + fragment_start: None, + }) + } }, - '/' | '\\' | '?' | '#' => { - end = i; - break + Some('?') => { + if let Some(base_url) = base_file_url { + // Copy everything up to the query string + let before_query = match (base_url.query_start, base_url.fragment_start) { + (None, None) => &*base_url.serialization, + (Some(i), _) | + (None, Some(i)) => base_url.slice(..i) + }; + self.serialization.push_str(before_query); + let (query_start, fragment_start) = + try!(self.parse_query_and_fragment(base_url.scheme_end, input)); + Ok(Url { + serialization: self.serialization, + query_start: query_start, + fragment_start: fragment_start, + ..*base_url + }) + } else { + self.serialization.push_str("file:///"); + let scheme_end = "file".len() as u32; + let path_start = "file://".len() as u32; + let (query_start, fragment_start) = + try!(self.parse_query_and_fragment(scheme_end, input)); + Ok(Url { + serialization: self.serialization, + scheme_end: scheme_end, + username_end: path_start, + host_start: path_start, + host_end: path_start, + host: HostInternal::None, + port: None, + path_start: path_start, + query_start: query_start, + fragment_start: fragment_start, + }) + } }, - '\t' | '\n' | '\r' => try!(parser.parse_error(ParseError::InvalidCharacter)), - c => { - match c { - '[' => inside_square_brackets = true, - ']' => inside_square_brackets = false, - _ => (), + Some('#') => { + if let Some(base_url) = base_file_url { + self.fragment_only(base_url, input) + } else { + self.serialization.push_str("file:///"); + let scheme_end = "file".len() as u32; + let path_start = "file://".len() as u32; + let fragment_start = "file:///".len() as u32; + self.parse_fragment(&input[1..]); + Ok(Url { + serialization: self.serialization, + scheme_end: scheme_end, + username_end: path_start, + host_start: path_start, + host_end: path_start, + host: HostInternal::None, + port: None, + 
path_start: path_start, + query_start: None, + fragment_start: Some(fragment_start), + }) + } + } + Some('/') | Some('\\') => { + self.syntax_violation_if("backslash", || c == Some('\\')); + let input = &input[1..]; + // file slash state + let c = input.chars().next(); + self.syntax_violation_if("backslash", || c == Some('\\')); + if matches!(c, Some('/') | Some('\\')) { + // file host state + self.serialization.push_str("file://"); + let scheme_end = "file".len() as u32; + let host_start = "file://".len() as u32; + let (path_start, host, remaining) = try!(self.parse_file_host(&input[1..])); + let host_end = try!(to_u32(self.serialization.len())); + let mut has_host = !matches!(host, HostInternal::None); + let remaining = if path_start { + self.parse_path_start(SchemeType::File, &mut has_host, remaining) + } else { + let path_start = self.serialization.len(); + self.serialization.push('/'); + self.parse_path(SchemeType::File, &mut has_host, path_start, remaining) + }; + // FIXME: deal with has_host + let (query_start, fragment_start) = + try!(self.parse_query_and_fragment(scheme_end, remaining)); + Ok(Url { + serialization: self.serialization, + scheme_end: scheme_end, + username_end: host_start, + host_start: host_start, + host_end: host_end, + host: host, + port: None, + path_start: host_end, + query_start: query_start, + fragment_start: fragment_start, + }) + } else { + self.serialization.push_str("file:///"); + let scheme_end = "file".len() as u32; + let path_start = "file://".len(); + if let Some(base_url) = base_file_url { + let first_segment = base_url.path_segments().unwrap().next().unwrap(); + // FIXME: *normalized* drive letter + if is_windows_drive_letter(first_segment) { + self.serialization.push_str(first_segment); + self.serialization.push('/'); + } + } + let remaining = self.parse_path( + SchemeType::File, &mut false, path_start, input); + let (query_start, fragment_start) = + try!(self.parse_query_and_fragment(scheme_end, remaining)); + let 
path_start = path_start as u32; + Ok(Url { + serialization: self.serialization, + scheme_end: scheme_end, + username_end: path_start, + host_start: path_start, + host_end: path_start, + host: HostInternal::None, + port: None, + path_start: path_start, + query_start: query_start, + fragment_start: fragment_start, + }) + } + } + _ => { + if starts_with_windows_drive_letter_segment(input) { + base_file_url = None; + } + if let Some(base_url) = base_file_url { + let before_query = match (base_url.query_start, base_url.fragment_start) { + (None, None) => &*base_url.serialization, + (Some(i), _) | + (None, Some(i)) => base_url.slice(..i) + }; + self.serialization.push_str(before_query); + self.pop_path(SchemeType::File, base_url.path_start as usize); + let remaining = self.parse_path( + SchemeType::File, &mut true, base_url.path_start as usize, input); + self.with_query_and_fragment( + base_url.scheme_end, base_url.username_end, base_url.host_start, + base_url.host_end, base_url.host, base_url.port, base_url.path_start, remaining) + } else { + self.serialization.push_str("file:///"); + let scheme_end = "file".len() as u32; + let path_start = "file://".len(); + let remaining = self.parse_path( + SchemeType::File, &mut false, path_start, input); + let (query_start, fragment_start) = + try!(self.parse_query_and_fragment(scheme_end, remaining)); + let path_start = path_start as u32; + Ok(Url { + serialization: self.serialization, + scheme_end: scheme_end, + username_end: path_start, + host_start: path_start, + host_end: path_start, + host: HostInternal::None, + port: None, + path_start: path_start, + query_start: query_start, + fragment_start: fragment_start, + }) } - host_input.push(c) } } } - let host = try!(Host::parse(&host_input)); - Ok((host, &input[end..])) -} - -pub fn parse_port<'a>(input: &'a str, scheme_type: SchemeType, parser: &UrlParser) - -> ParseResult<(Option, Option, &'a str)> { - let mut port = 0; - let mut has_any_digit = false; - let mut end = 
input.len(); - for (i, c) in input.char_indices() { - match c { - '0'...'9' => { - port = port * 10 + (c as u32 - '0' as u32); - if port > ::std::u16::MAX as u32 { - return Err(ParseError::InvalidPort) - } - has_any_digit = true; + fn parse_relative(mut self, input: &str, scheme_type: SchemeType, base_url: &Url) + -> ParseResult { + // relative state + debug_assert!(self.serialization.is_empty()); + match input.chars().next() { + None => { + // Copy everything except the fragment + let before_fragment = match base_url.fragment_start { + Some(i) => &base_url.serialization[..i as usize], + None => &*base_url.serialization, + }; + self.serialization.push_str(before_fragment); + Ok(Url { + serialization: self.serialization, + fragment_start: None, + ..*base_url + }) }, - '/' | '\\' | '?' | '#' => { - end = i; - break + Some('?') => { + // Copy everything up to the query string + let before_query = match (base_url.query_start, base_url.fragment_start) { + (None, None) => &*base_url.serialization, + (Some(i), _) | + (None, Some(i)) => base_url.slice(..i) + }; + self.serialization.push_str(before_query); + let (query_start, fragment_start) = + try!(self.parse_query_and_fragment(base_url.scheme_end, input)); + Ok(Url { + serialization: self.serialization, + query_start: query_start, + fragment_start: fragment_start, + ..*base_url + }) }, - '\t' | '\n' | '\r' => try!(parser.parse_error(ParseError::InvalidCharacter)), - _ => return Err(ParseError::InvalidPort) + Some('#') => self.fragment_only(base_url, input), + Some('/') | Some('\\') => { + let slashes_count = input.find(|c| !matches!(c, '/' | '\\')).unwrap_or(input.len()); + if slashes_count >= 2 { + self.syntax_violation_if("expected //", || &input[..slashes_count] != "//"); + let scheme_end = base_url.scheme_end; + debug_assert!(base_url.byte_at(scheme_end) == b':'); + self.serialization.push_str(base_url.slice(..scheme_end + 1)); + return self.after_double_slash(&input[slashes_count..], scheme_type, scheme_end) + } + 
let path_start = base_url.path_start; + debug_assert!(base_url.byte_at(path_start) == b'/'); + self.serialization.push_str(base_url.slice(..path_start + 1)); + let remaining = self.parse_path( + scheme_type, &mut true, path_start as usize, &input[1..]); + self.with_query_and_fragment( + base_url.scheme_end, base_url.username_end, base_url.host_start, + base_url.host_end, base_url.host, base_url.port, base_url.path_start, remaining) + } + _ => { + let before_query = match (base_url.query_start, base_url.fragment_start) { + (None, None) => &*base_url.serialization, + (Some(i), _) | + (None, Some(i)) => base_url.slice(..i) + }; + self.serialization.push_str(before_query); + // FIXME spec says just "remove last entry", not the "pop" algorithm + self.pop_path(scheme_type, base_url.path_start as usize); + let remaining = self.parse_path( + scheme_type, &mut true, base_url.path_start as usize, input); + self.with_query_and_fragment( + base_url.scheme_end, base_url.username_end, base_url.host_start, + base_url.host_end, base_url.host, base_url.port, base_url.path_start, remaining) + } } } - let default_port = scheme_type.default_port(); - let mut port = Some(port as u16); - if !has_any_digit || port == default_port { - port = None; - } - Ok((port, default_port, &input[end..])) -} + fn after_double_slash(mut self, input: &str, scheme_type: SchemeType, scheme_end: u32) + -> ParseResult { + self.serialization.push('/'); + self.serialization.push('/'); + // authority state + let (username_end, remaining) = try!(self.parse_userinfo(input, scheme_type)); + // host state + let host_start = try!(to_u32(self.serialization.len())); + let (host_end, host, port, remaining) = + try!(self.parse_host_and_port(remaining, scheme_end, scheme_type)); + // path state + let path_start = try!(to_u32(self.serialization.len())); + let remaining = self.parse_path_start( + scheme_type, &mut true, remaining); + self.with_query_and_fragment(scheme_end, username_end, host_start, + host_end, host, 
port, path_start, remaining) + } -fn parse_file_host<'a>(input: &'a str, parser: &UrlParser) -> ParseResult<(Host, &'a str)> { - let mut host_input = String::new(); - let mut end = input.len(); - for (i, c) in input.char_indices() { - match c { - '/' | '\\' | '?' | '#' => { - end = i; - break - }, - '\t' | '\n' | '\r' => try!(parser.parse_error(ParseError::InvalidCharacter)), - _ => host_input.push(c) + /// Return (username_end, remaining) + fn parse_userinfo<'i>(&mut self, input: &'i str, scheme_type: SchemeType) + -> ParseResult<(u32, &'i str)> { + let mut last_at = None; + for (i, c) in input.char_indices() { + match c { + '@' => { + if last_at.is_some() { + self.syntax_violation("unencoded @ sign in username or password") + } else { + self.syntax_violation( + "embedding authentification information (username or password) \ + in an URL is not recommended") + } + last_at = Some(i) + }, + '/' | '?' | '#' => break, + '\\' if scheme_type.is_special() => break, + _ => (), + } + } + let (input, remaining) = match last_at { + None => return Ok((try!(to_u32(self.serialization.len())), input)), + Some(0) => return Ok((try!(to_u32(self.serialization.len())), &input[1..])), + Some(at) => (&input[..at], &input[at + 1..]), + }; + + let mut username_end = None; + for (i, c, next_i) in input.char_ranges() { + match c { + ':' if username_end.is_none() => { + // Start parsing password + username_end = Some(try!(to_u32(self.serialization.len()))); + self.serialization.push(':'); + }, + '\t' | '\n' | '\r' => {}, + _ => { + self.check_url_code_point(input, i, c); + let utf8_c = &input[i..next_i]; + self.serialization.extend(utf8_percent_encode(utf8_c, USERINFO_ENCODE_SET)); + } + } } + let username_end = match username_end { + Some(i) => i, + None => try!(to_u32(self.serialization.len())), + }; + self.serialization.push('@'); + Ok((username_end, remaining)) } - let host = if host_input.is_empty() { - Host::Domain(String::new()) - } else { - try!(Host::parse(&host_input)) - }; - 
Ok((host, &input[end..])) -} - -pub fn parse_standalone_path(input: &str, parser: &UrlParser) - -> ParseResult<(Vec, Option, Option)> { - if !input.starts_with("/") { - if input.starts_with("\\") { - try!(parser.parse_error(ParseError::InvalidBackslash)); + fn parse_host_and_port<'i>(&mut self, input: &'i str, + scheme_end: u32, scheme_type: SchemeType) + -> ParseResult<(u32, HostInternal, Option, &'i str)> { + let (host, remaining) = try!( + Parser::parse_host(input, scheme_type, |m| self.syntax_violation(m))); + write!(&mut self.serialization, "{}", host).unwrap(); + let host_end = try!(to_u32(self.serialization.len())); + let (port, remaining) = if remaining.starts_with(":") { + let syntax_violation = |message| self.syntax_violation(message); + let scheme = || default_port(&self.serialization[..scheme_end as usize]); + try!(Parser::parse_port(&remaining[1..], syntax_violation, scheme, self.context)) } else { - return Err(ParseError::ExpectedInitialSlash) + (None, remaining) + }; + if let Some(port) = port { + write!(&mut self.serialization, ":{}", port).unwrap() } + Ok((host_end, host.into(), port, remaining)) } - let (path, remaining) = try!(parse_path( - &[], &input[1..], Context::UrlParser, SchemeType::Relative(0), parser)); - let (query, fragment) = try!(parse_query_and_fragment(remaining, parser)); - Ok((path, query, fragment)) -} - -pub fn parse_path_start<'a>(input: &'a str, context: Context, scheme_type: SchemeType, - parser: &UrlParser) - -> ParseResult<(Vec, &'a str)> { - let mut i = 0; - // Relative path start state - match input.chars().next() { - Some('/') => i = 1, - Some('\\') => { - try!(parser.parse_error(ParseError::InvalidBackslash)); - i = 1; - }, - _ => () - } - parse_path(&[], &input[i..], context, scheme_type, parser) -} - - -fn parse_path<'a>(base_path: &[String], input: &'a str, context: Context, - scheme_type: SchemeType, parser: &UrlParser) - -> ParseResult<(Vec, &'a str)> { - // Relative path state - let mut path = base_path.to_vec(); 
- let mut iter = input.char_ranges(); - let mut end; - loop { - let mut path_part = String::new(); - let mut ends_with_slash = false; - end = input.len(); - while let Some((i, c, next_i)) = iter.next() { - match c { - '/' => { - ends_with_slash = true; + pub fn parse_host<'i, S>(input: &'i str, scheme_type: SchemeType, syntax_violation: S) + -> ParseResult<(Host, &'i str)> + where S: Fn(&'static str) { + let mut inside_square_brackets = false; + let mut has_ignored_chars = false; + let mut end = input.len(); + for (i, b) in input.bytes().enumerate() { + match b { + b':' if !inside_square_brackets => { end = i; break }, - '\\' => { - try!(parser.parse_error(ParseError::InvalidBackslash)); - ends_with_slash = true; + b'/' | b'?' | b'#' => { end = i; break - }, - '?' | '#' if context == Context::UrlParser => { + } + b'\\' if scheme_type.is_special() => { end = i; break - }, - '\t' | '\n' | '\r' => try!(parser.parse_error(ParseError::InvalidCharacter)), - _ => { - try!(check_url_code_point(input, i, c, parser)); - utf8_percent_encode_to(&input[i..next_i], - DEFAULT_ENCODE_SET, &mut path_part); } + b'\t' | b'\n' | b'\r' => { + syntax_violation("invalid character"); + has_ignored_chars = true; + } + b'[' => inside_square_brackets = true, + b']' => inside_square_brackets = false, + _ => {} } } - match &*path_part { - ".." | ".%2e" | ".%2E" | "%2e." | "%2E." 
| - "%2e%2e" | "%2E%2e" | "%2e%2E" | "%2E%2E" => { - path.pop(); - if !ends_with_slash { - path.push(String::new()); + let replaced: String; + let host_input = if has_ignored_chars { + replaced = input[..end].chars().filter(|&c| !matches!(c, '\t' | '\n' | '\r')).collect(); + &*replaced + } else { + &input[..end] + }; + if scheme_type.is_special() && host_input.is_empty() { + return Err(ParseError::EmptyHost) + } + let host = try!(Host::parse(&host_input)); + Ok((host, &input[end..])) + } + + pub fn parse_file_host<'i>(&mut self, input: &'i str) + -> ParseResult<(bool, HostInternal, &'i str)> { + let mut has_ignored_chars = false; + let mut end = input.len(); + for (i, b) in input.bytes().enumerate() { + match b { + b'/' | b'\\' | b'?' | b'#' => { + end = i; + break } - }, - "." | "%2e" | "%2E" => { - if !ends_with_slash { - path.push(String::new()); + b'\t' | b'\n' | b'\r' => { + self.syntax_violation("invalid character"); + has_ignored_chars = true; } - }, - _ => { - if scheme_type == SchemeType::FileLike - && path.is_empty() - && path_part.len() == 2 - && starts_with_ascii_alpha(&path_part) - && path_part.as_bytes()[1] == b'|' { - // Windows drive letter quirk - unsafe { - path_part.as_mut_vec()[1] = b':' + _ => {} + } + } + let replaced: String; + let host_input = if has_ignored_chars { + replaced = input[..end].chars().filter(|&c| !matches!(c, '\t' | '\n' | '\r')).collect(); + &*replaced + } else { + &input[..end] + }; + if is_windows_drive_letter(host_input) { + return Ok((false, HostInternal::None, input)) + } + let host = if host_input.is_empty() { + HostInternal::None + } else { + match try!(Host::parse(&host_input)) { + Host::Domain(ref d) if d == "localhost" => HostInternal::None, + host => { + write!(&mut self.serialization, "{}", host).unwrap(); + host.into() + } + } + }; + Ok((true, host, &input[end..])) + } + + pub fn parse_port<'i, V, P>(input: &'i str, syntax_violation: V, default_port: P, + context: Context) + -> ParseResult<(Option, &'i str)> + 
where V: Fn(&'static str), P: Fn() -> Option { + let mut port: u32 = 0; + let mut has_any_digit = false; + let mut end = input.len(); + for (i, c) in input.char_indices() { + if let Some(digit) = c.to_digit(10) { + port = port * 10 + digit; + if port > ::std::u16::MAX as u32 { + return Err(ParseError::InvalidPort) + } + has_any_digit = true; + } else { + match c { + '\t' | '\n' | '\r' => { + syntax_violation("invalid character"); + continue + } + '/' | '\\' | '?' | '#' => {} + _ => if context == Context::UrlParser { + return Err(ParseError::InvalidPort) } } - path.push(path_part) + end = i; + break } } - if !ends_with_slash { - break + let mut opt_port = Some(port as u16); + if !has_any_digit || opt_port == default_port() { + opt_port = None; } + return Ok((opt_port, &input[end..])) } - Ok((path, &input[end..])) -} + pub fn parse_path_start<'i>(&mut self, scheme_type: SchemeType, has_host: &mut bool, + mut input: &'i str) + -> &'i str { + // Path start state + let mut iter = input.chars(); + match iter.next() { + Some('/') => input = iter.as_str(), + Some('\\') if scheme_type.is_special() => { + self.syntax_violation("backslash"); + input = iter.as_str() + } + _ => {} + } + let path_start = self.serialization.len(); + self.serialization.push('/'); + self.parse_path(scheme_type, has_host, path_start, input) + } -fn parse_scheme_data<'a>(input: &'a str, parser: &UrlParser) - -> ParseResult<(String, &'a str)> { - let mut scheme_data = String::new(); - let mut end = input.len(); - for (i, c, next_i) in input.char_ranges() { - match c { - '?' 
| '#' => { - end = i; + pub fn parse_path<'i>(&mut self, scheme_type: SchemeType, has_host: &mut bool, + path_start: usize, input: &'i str) + -> &'i str { + // Relative path state + debug_assert!(self.serialization.ends_with("/")); + let mut iter = input.char_ranges(); + let mut end; + loop { + let segment_start = self.serialization.len(); + let mut ends_with_slash = false; + end = input.len(); + while let Some((i, c, next_i)) = iter.next() { + match c { + '/' if self.context != Context::PathSegmentSetter => { + ends_with_slash = true; + end = i; + break + }, + '\\' if self.context != Context::PathSegmentSetter && + scheme_type.is_special() => { + self.syntax_violation("backslash"); + ends_with_slash = true; + end = i; + break + }, + '?' | '#' if self.context == Context::UrlParser => { + end = i; + break + }, + '\t' | '\n' | '\r' => self.syntax_violation("invalid characters"), + _ => { + self.check_url_code_point(input, i, c); + if c == '%' { + let after_percent_sign = iter.clone(); + if matches!(iter.next(), Some((_, '2', _))) && + matches!(iter.next(), Some((_, 'E', _)) | Some((_, 'e', _))) { + self.serialization.push('.'); + continue + } + iter = after_percent_sign + } + if self.context == Context::PathSegmentSetter { + self.serialization.extend(utf8_percent_encode( + &input[i..next_i], PATH_SEGMENT_ENCODE_SET)); + } else { + self.serialization.extend(utf8_percent_encode( + &input[i..next_i], DEFAULT_ENCODE_SET)); + } + } + } + } + match &self.serialization[segment_start..] { + ".." => { + debug_assert!(self.serialization.as_bytes()[segment_start - 1] == b'/'); + self.serialization.truncate(segment_start - 1); // Truncate "/.." + self.pop_path(scheme_type, path_start); + if !self.serialization[path_start..].ends_with("/") { + self.serialization.push('/') + } + }, + "." => { + self.serialization.truncate(segment_start); + }, + _ => { + if scheme_type.is_file() && is_windows_drive_letter( + &self.serialization[path_start + 1..] 
+ ) { + if self.serialization.ends_with('|') { + self.serialization.pop(); + self.serialization.push(':'); + } + if *has_host { + self.syntax_violation("file: with host and Windows drive letter"); + *has_host = false; // FIXME account for this in callers + } + } + if ends_with_slash { + self.serialization.push('/') + } + } + } + if !ends_with_slash { break - }, - '\t' | '\n' | '\r' => try!(parser.parse_error(ParseError::InvalidCharacter)), - _ => { - try!(check_url_code_point(input, i, c, parser)); - utf8_percent_encode_to(&input[i..next_i], - SIMPLE_ENCODE_SET, &mut scheme_data); } } + &input[end..] } - Ok((scheme_data, &input[end..])) -} + /// https://url.spec.whatwg.org/#pop-a-urls-path + fn pop_path(&mut self, scheme_type: SchemeType, path_start: usize) { + if self.serialization.len() > path_start { + let slash_position = self.serialization[path_start..].rfind('/').unwrap(); + // + 1 since rfind returns the position before the slash. + let segment_start = path_start + slash_position + 1; + // Don’t pop a Windows drive letter + // FIXME: *normalized* Windows drive letter + if !( + scheme_type.is_file() && + is_windows_drive_letter(&self.serialization[segment_start..]) + ) { + self.serialization.truncate(segment_start); + } + } -fn parse_query_and_fragment(input: &str, parser: &UrlParser) - -> ParseResult<(Option, Option)> { - match input.chars().next() { - Some('#') => Ok((None, Some(try!(parse_fragment(&input[1..], parser))))), - Some('?') => { - let (query, remaining) = try!(parse_query( - &input[1..], Context::UrlParser, parser)); - let fragment = match remaining { - Some(remaining) => Some(try!(parse_fragment(remaining, parser))), - None => None - }; - Ok((Some(query), fragment)) - }, - None => Ok((None, None)), - _ => panic!("Programming error. 
parse_query_and_fragment() should not \ - have been called with input \"{}\"", input) } -} - -pub fn parse_query<'a>(input: &'a str, context: Context, parser: &UrlParser) - -> ParseResult<(String, Option<&'a str>)> { - let mut query = String::new(); - let mut remaining = None; - for (i, c) in input.char_indices() { - match c { - '#' if context == Context::UrlParser => { - remaining = Some(&input[i + 1..]); - break - }, - '\t' | '\n' | '\r' => try!(parser.parse_error(ParseError::InvalidCharacter)), - _ => { - try!(check_url_code_point(input, i, c, parser)); - query.push(c); + pub fn parse_cannot_be_a_base_path<'i>(&mut self, input: &'i str) -> &'i str { + for (i, c, next_i) in input.char_ranges() { + match c { + '?' | '#' if self.context == Context::UrlParser => return &input[i..], + '\t' | '\n' | '\r' => self.syntax_violation("invalid character"), + _ => { + self.check_url_code_point(input, i, c); + self.serialization.extend(utf8_percent_encode( + &input[i..next_i], SIMPLE_ENCODE_SET)); + } } } + "" } - let query_bytes = parser.query_encoding_override.encode(&query); - Ok((percent_encode(&query_bytes, QUERY_ENCODE_SET), remaining)) -} + fn with_query_and_fragment(mut self, scheme_end: u32, username_end: u32, + host_start: u32, host_end: u32, host: HostInternal, + port: Option, path_start: u32, remaining: &str) + -> ParseResult { + let (query_start, fragment_start) = + try!(self.parse_query_and_fragment(scheme_end, remaining)); + Ok(Url { + serialization: self.serialization, + scheme_end: scheme_end, + username_end: username_end, + host_start: host_start, + host_end: host_end, + host: host, + port: port, + path_start: path_start, + query_start: query_start, + fragment_start: fragment_start + }) + } + /// Return (query_start, fragment_start) + fn parse_query_and_fragment(&mut self, scheme_end: u32, mut input: &str) + -> ParseResult<(Option, Option)> { + let mut query_start = None; + match input.chars().next() { + Some('#') => {} + Some('?') => { + query_start = 
Some(try!(to_u32(self.serialization.len()))); + self.serialization.push('?'); + let remaining = self.parse_query(scheme_end, &input[1..]); + if let Some(remaining) = remaining { + input = remaining + } else { + return Ok((query_start, None)) + } + } + None => return Ok((None, None)), + _ => panic!("Programming error. parse_query_and_fragment() should not \ + have been called with input \"{}\"", input) + }; + + let fragment_start = try!(to_u32(self.serialization.len())); + self.serialization.push('#'); + debug_assert!(input.starts_with("#")); + self.parse_fragment(&input[1..]); + Ok((query_start, Some(fragment_start))) + } -pub fn parse_fragment<'a>(input: &'a str, parser: &UrlParser) -> ParseResult { - let mut fragment = String::new(); - for (i, c, next_i) in input.char_ranges() { - match c { - '\t' | '\n' | '\r' => try!(parser.parse_error(ParseError::InvalidCharacter)), - _ => { - try!(check_url_code_point(input, i, c, parser)); - utf8_percent_encode_to(&input[i..next_i], - SIMPLE_ENCODE_SET, &mut fragment); + pub fn parse_query<'i>(&mut self, scheme_end: u32, input: &'i str) + -> Option<&'i str> { + let mut query = String::new(); // FIXME: use a streaming decoder instead + let mut remaining = None; + for (i, c) in input.char_indices() { + match c { + '#' if self.context == Context::UrlParser => { + remaining = Some(&input[i..]); + break + }, + '\t' | '\n' | '\r' => self.syntax_violation("invalid characters"), + _ => { + self.check_url_code_point(input, i, c); + query.push(c); + } } } + + let encoding = match &self.serialization[..scheme_end as usize] { + "http" | "https" | "file" | "ftp" | "gopher" => self.query_encoding_override, + _ => EncodingOverride::utf8(), + }; + let query_bytes = encoding.encode(query.into()); + self.serialization.extend(percent_encode(&query_bytes, QUERY_ENCODE_SET)); + remaining } - Ok(fragment) -} + fn fragment_only(mut self, base_url: &Url, input: &str) -> ParseResult { + let before_fragment = match base_url.fragment_start { + Some(i) 
=> base_url.slice(..i), + None => &*base_url.serialization, + }; + debug_assert!(self.serialization.is_empty()); + self.serialization.reserve(before_fragment.len() + input.len()); + self.serialization.push_str(before_fragment); + self.serialization.push('#'); + debug_assert!(input.starts_with("#")); + self.parse_fragment(&input[1..]); + Ok(Url { + serialization: self.serialization, + fragment_start: Some(try!(to_u32(before_fragment.len()))), + ..*base_url + }) + } -#[inline] -pub fn starts_with_ascii_alpha(string: &str) -> bool { - matches!(string.as_bytes()[0], b'a'...b'z' | b'A'...b'Z') + pub fn parse_fragment(&mut self, input: &str) { + for (i, c) in input.char_indices() { + match c { + '\0' | '\t' | '\n' | '\r' => self.syntax_violation("invalid character"), + _ => { + self.check_url_code_point(input, i, c); + self.serialization.push(c); // No percent-encoding here. + } + } + } + } + + fn check_url_code_point(&self, input: &str, i: usize, c: char) { + if let Some(log) = self.log_syntax_violation { + if c == '%' { + if !starts_with_2_hex(&input[i + 1..]) { + log("expected 2 hex digits after %") + } + } else if !is_url_code_point(c) { + log("non-URL code point") + } + } + } } #[inline] @@ -674,6 +1011,13 @@ fn starts_with_2_hex(input: &str) -> bool { && is_ascii_hex_digit(input.as_bytes()[1]) } +// Non URL code points: +// U+0000 to U+0020 (space) +// " # % < > [ \ ] ^ ` { | } +// U+007F to U+009F +// surrogates +// U+FDD0 to U+FDEF +// Last two of each plane: U+__FFFE to U+__FFFF for __ in 00 to 10 hex #[inline] fn is_url_code_point(c: char) -> bool { matches!(c, @@ -693,20 +1037,11 @@ fn is_url_code_point(c: char) -> bool { '\u{F0000}'...'\u{FFFFD}' | '\u{100000}'...'\u{10FFFD}') } -// Non URL code points: -// U+0000 to U+0020 (space) -// " # % < > [ \ ] ^ ` { | } -// U+007F to U+009F -// surrogates -// U+FDD0 to U+FDEF -// Last two of each plane: U+__FFFE to U+__FFFF for __ in 00 to 10 hex - pub trait StrCharRanges<'a> { fn char_ranges(&self) -> CharRanges<'a>; 
} - impl<'a> StrCharRanges<'a> for &'a str { #[inline] fn char_ranges(&self) -> CharRanges<'a> { @@ -714,6 +1049,7 @@ impl<'a> StrCharRanges<'a> for &'a str { } } +#[derive(Clone)] pub struct CharRanges<'a> { slice: &'a str, position: usize, @@ -735,15 +1071,41 @@ impl<'a> Iterator for CharRanges<'a> { } } +/// https://url.spec.whatwg.org/#c0-controls-and-space #[inline] -fn check_url_code_point(input: &str, i: usize, c: char, parser: &UrlParser) - -> ParseResult<()> { - if c == '%' { - if !starts_with_2_hex(&input[i + 1..]) { - try!(parser.parse_error(ParseError::InvalidPercentEncoded)); - } - } else if !is_url_code_point(c) { - try!(parser.parse_error(ParseError::NonUrlCodePoint)); +fn c0_control_or_space(ch: char) -> bool { + ch <= ' ' // U+0000 to U+0020 +} + +/// https://url.spec.whatwg.org/#ascii-alpha +#[inline] +pub fn ascii_alpha(ch: char) -> bool { + matches!(ch, 'a'...'z' | 'A'...'Z') +} + +#[inline] +pub fn to_u32(i: usize) -> ParseResult { + if i <= ::std::u32::MAX as usize { + Ok(i as u32) + } else { + Err(ParseError::Overflow) } - Ok(()) +} + +/// Whether the scheme is file:, the path has a single segment, and that segment +/// is a Windows drive letter +fn is_windows_drive_letter(segment: &str) -> bool { + segment.len() == 2 + && starts_with_windows_drive_letter(segment) +} + +fn starts_with_windows_drive_letter(s: &str) -> bool { + ascii_alpha(s.as_bytes()[0] as char) + && matches!(s.as_bytes()[1], b':' | b'|') +} + +fn starts_with_windows_drive_letter_segment(s: &str) -> bool { + s.len() >= 3 + && starts_with_windows_drive_letter(s) + && matches!(s.as_bytes()[2], b'/' | b'\\' | b'?' | b'#') } diff --git a/src/percent_encoding.rs b/src/percent_encoding.rs index ee11cc3d..72b47245 100644 --- a/src/percent_encoding.rs +++ b/src/percent_encoding.rs @@ -1,4 +1,4 @@ -// Copyright 2013-2014 Simon Sapin. +// Copyright 2013-2016 The rust-url developers. // // Licensed under the Apache License, Version 2.0 or the MIT license @@ -6,9 +6,12 @@ // option.
This file may not be copied, modified, or distributed // except according to those terms. - -#[path = "encode_sets.rs"] -mod encode_sets; +use encoding; +use std::ascii::AsciiExt; +use std::borrow::Cow; +use std::fmt::{self, Write}; +use std::slice; +use std::str; /// Represents a set of characters / bytes that should be percent-encoded. /// @@ -21,147 +24,321 @@ mod encode_sets; /// In the query string however, a question mark does not have any special meaning /// and does not need to be percent-encoded. /// -/// Since the implementation details of `EncodeSet` are private, -/// the set of available encode sets is not extensible beyond the ones -/// provided here. -/// If you need a different encode set, -/// please [file a bug](https://github.com/servo/rust-url/issues) -/// explaining the use case. -#[derive(Copy, Clone)] -pub struct EncodeSet { - map: &'static [&'static str; 256], +/// A few sets are defined in this module. +/// Use the [`define_encode_set!`](../macro.define_encode_set!.html) macro to define different ones. +pub trait EncodeSet: Clone { + /// Called with UTF-8 bytes rather than code points. + /// Should return true for all non-ASCII bytes. + fn contains(&self, byte: u8) -> bool; } -/// This encode set is used for fragment identifier and non-relative scheme data. -pub static SIMPLE_ENCODE_SET: EncodeSet = EncodeSet { map: &encode_sets::SIMPLE }; - -/// This encode set is used in the URL parser for query strings. -pub static QUERY_ENCODE_SET: EncodeSet = EncodeSet { map: &encode_sets::QUERY }; +/// Define a new struct +/// that implements the [`EncodeSet`](percent_encoding/trait.EncodeSet.html) trait, +/// for use in [`percent_encode()`](percent_encoding/fn.percent_encode.html) +/// and related functions. +/// +/// Parameters are characters to include in the set in addition to those of the base set. +/// See [encode sets specification](http://url.spec.whatwg.org/#simple-encode-set).
+/// +/// Example +/// ======= +/// +/// ```rust +/// #[macro_use] extern crate url; +/// use url::percent_encoding::{utf8_percent_encode, SIMPLE_ENCODE_SET}; +/// define_encode_set! { +/// /// This encode set is used in the URL parser for query strings. +/// pub QUERY_ENCODE_SET = [SIMPLE_ENCODE_SET] | {' ', '"', '#', '<', '>'} +/// } +/// # fn main() { +/// assert_eq!(utf8_percent_encode("foo bar", QUERY_ENCODE_SET).collect::(), "foo%20bar"); +/// # } +/// ``` +#[macro_export] +macro_rules! define_encode_set { + ($(#[$attr: meta])* pub $name: ident = [$base_set: expr] | {$($ch: pat),*}) => { + $(#[$attr])* + #[derive(Copy, Clone)] + #[allow(non_camel_case_types)] + pub struct $name; -/// This encode set is used for path components. -pub static DEFAULT_ENCODE_SET: EncodeSet = EncodeSet { map: &encode_sets::DEFAULT }; + impl $crate::percent_encoding::EncodeSet for $name { + #[inline] + fn contains(&self, byte: u8) -> bool { + match byte as char { + $( + $ch => true, + )* + _ => $base_set.contains(byte) + } + } + } + } +} -/// This encode set is used in the URL parser for usernames and passwords. -pub static USERINFO_ENCODE_SET: EncodeSet = EncodeSet { map: &encode_sets::USERINFO }; +/// This encode set is used for the path of cannot-be-a-base URLs. +#[derive(Copy, Clone)] +#[allow(non_camel_case_types)] +pub struct SIMPLE_ENCODE_SET; -/// This encode set should be used when setting the password field of a parsed URL. -pub static PASSWORD_ENCODE_SET: EncodeSet = EncodeSet { map: &encode_sets::PASSWORD }; +impl EncodeSet for SIMPLE_ENCODE_SET { + #[inline] + fn contains(&self, byte: u8) -> bool { + byte < 0x20 || byte > 0x7E + } +} -/// This encode set should be used when setting the username field of a parsed URL. -pub static USERNAME_ENCODE_SET: EncodeSet = EncodeSet { map: &encode_sets::USERNAME }; +define_encode_set! { + /// This encode set is used in the URL parser for query strings. 
+ pub QUERY_ENCODE_SET = [SIMPLE_ENCODE_SET] | {' ', '"', '#', '<', '>'} +} -/// This encode set is used in `application/x-www-form-urlencoded` serialization. -pub static FORM_URLENCODED_ENCODE_SET: EncodeSet = EncodeSet { - map: &encode_sets::FORM_URLENCODED, -}; +define_encode_set! { + /// This encode set is used for path components. + pub DEFAULT_ENCODE_SET = [QUERY_ENCODE_SET] | {'`', '?', '{', '}'} +} -/// This encode set is used for HTTP header values and is defined at -/// https://tools.ietf.org/html/rfc5987#section-3.2 -pub static HTTP_VALUE_ENCODE_SET: EncodeSet = EncodeSet { map: &encode_sets::HTTP_VALUE }; +define_encode_set! { + /// This encode set is used for on '/'-separated path segment + pub PATH_SEGMENT_ENCODE_SET = [DEFAULT_ENCODE_SET] | {'%', '/'} +} -/// Percent-encode the given bytes, and push the result to `output`. -/// -/// The pushed strings are within the ASCII range. -#[inline] -pub fn percent_encode_to(input: &[u8], encode_set: EncodeSet, output: &mut String) { - for &byte in input { - output.push_str(encode_set.map[byte as usize]) +define_encode_set! { + /// This encode set is used for username and password. + pub USERINFO_ENCODE_SET = [DEFAULT_ENCODE_SET] | { + '/', ':', ';', '=', '@', '[', '\\', ']', '^', '|' } } - -/// Percent-encode the given bytes. +/// Return the percent-encoding of the given bytes. /// -/// The returned string is within the ASCII range. -#[inline] -pub fn percent_encode(input: &[u8], encode_set: EncodeSet) -> String { - let mut output = String::new(); - percent_encode_to(input, encode_set, &mut output); - output +/// This is unconditional, unlike `percent_encode()` which uses an encode set. 
+pub fn percent_encode_byte(byte: u8) -> &'static str { + let index = usize::from(byte) * 3; + &"\ + %00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F\ + %10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F\ + %20%21%22%23%24%25%26%27%28%29%2A%2B%2C%2D%2E%2F\ + %30%31%32%33%34%35%36%37%38%39%3A%3B%3C%3D%3E%3F\ + %40%41%42%43%44%45%46%47%48%49%4A%4B%4C%4D%4E%4F\ + %50%51%52%53%54%55%56%57%58%59%5A%5B%5C%5D%5E%5F\ + %60%61%62%63%64%65%66%67%68%69%6A%6B%6C%6D%6E%6F\ + %70%71%72%73%74%75%76%77%78%79%7A%7B%7C%7D%7E%7F\ + %80%81%82%83%84%85%86%87%88%89%8A%8B%8C%8D%8E%8F\ + %90%91%92%93%94%95%96%97%98%99%9A%9B%9C%9D%9E%9F\ + %A0%A1%A2%A3%A4%A5%A6%A7%A8%A9%AA%AB%AC%AD%AE%AF\ + %B0%B1%B2%B3%B4%B5%B6%B7%B8%B9%BA%BB%BC%BD%BE%BF\ + %C0%C1%C2%C3%C4%C5%C6%C7%C8%C9%CA%CB%CC%CD%CE%CF\ + %D0%D1%D2%D3%D4%D5%D6%D7%D8%D9%DA%DB%DC%DD%DE%DF\ + %E0%E1%E2%E3%E4%E5%E6%E7%E8%E9%EA%EB%EC%ED%EE%EF\ + %F0%F1%F2%F3%F4%F5%F6%F7%F8%F9%FA%FB%FC%FD%FE%FF\ + "[index..index + 3] } - -/// Percent-encode the UTF-8 encoding of the given string, and push the result to `output`. +/// Percent-encode the given bytes with the given encode set. /// -/// The pushed strings are within the ASCII range. +/// The encode set define which bytes (in addition to non-ASCII and controls) +/// need to be percent-encoded. +/// The choice of this set depends on context. +/// For example, `?` needs to be encoded in an URL path but not in a query string. +/// +/// The return value is an iterator of `&str` slices (so it has a `.collect::()` method) +/// that also implements `Display` and `Into>`. +/// The latter returns `Cow::Borrowed` when none of the bytes in `input` +/// are in the given encode set. 
#[inline] -pub fn utf8_percent_encode_to(input: &str, encode_set: EncodeSet, output: &mut String) { - percent_encode_to(input.as_bytes(), encode_set, output) +pub fn percent_encode(input: &[u8], encode_set: E) -> PercentEncode { + PercentEncode { + bytes: input, + encode_set: encode_set, + } } - /// Percent-encode the UTF-8 encoding of the given string. /// -/// The returned string is within the ASCII range. +/// See `percent_encode()` for how to use the return value. #[inline] -pub fn utf8_percent_encode(input: &str, encode_set: EncodeSet) -> String { - let mut output = String::new(); - utf8_percent_encode_to(input, encode_set, &mut output); - output -} - - -/// Percent-decode the given bytes, and push the result to `output`. -pub fn percent_decode_to(input: &[u8], output: &mut Vec) { - let mut i = 0; - while i < input.len() { - let c = input[i]; - if c == b'%' && i + 2 < input.len() { - if let (Some(h), Some(l)) = (from_hex(input[i + 1]), from_hex(input[i + 2])) { - output.push(h * 0x10 + l); - i += 3; - continue +pub fn utf8_percent_encode(input: &str, encode_set: E) -> PercentEncode { + percent_encode(input.as_bytes(), encode_set) +} + +/// The return type of `percent_encode()`.
+#[derive(Clone)] +pub struct PercentEncode<'a, E: EncodeSet> { + bytes: &'a [u8], + encode_set: E, +} + +impl<'a, E: EncodeSet> Iterator for PercentEncode<'a, E> { + type Item = &'a str; + + fn next(&mut self) -> Option<&'a str> { + if let Some((&first_byte, remaining)) = self.bytes.split_first() { + if self.encode_set.contains(first_byte) { + self.bytes = remaining; + Some(percent_encode_byte(first_byte)) + } else { + assert!(first_byte.is_ascii()); + for (i, &byte) in remaining.iter().enumerate() { + if self.encode_set.contains(byte) { + // 1 for first_byte + i for previous iterations of this loop + let (unchanged_slice, remaining) = self.bytes.split_at(1 + i); + self.bytes = remaining; + return Some(unsafe { str::from_utf8_unchecked(unchanged_slice) }) + } else { + assert!(byte.is_ascii()); + } + } + let unchanged_slice = self.bytes; + self.bytes = &[][..]; + Some(unsafe { str::from_utf8_unchecked(unchanged_slice) }) } + } else { + None } + } + + fn size_hint(&self) -> (usize, Option) { + if self.bytes.is_empty() { + (0, Some(0)) + } else { + (1, Some(self.bytes.len())) + } + } +} - output.push(c); - i += 1; +impl<'a, E: EncodeSet> fmt::Display for PercentEncode<'a, E> { + fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + for c in (*self).clone() { + try!(formatter.write_str(c)) + } + Ok(()) } } +impl<'a, E: EncodeSet> From> for Cow<'a, str> { + fn from(mut iter: PercentEncode<'a, E>) -> Self { + match iter.next() { + None => "".into(), + Some(first) => { + match iter.next() { + None => first.into(), + Some(second) => { + let mut string = first.to_owned(); + string.push_str(second); + string.extend(iter); + string.into() + } + } + } + } + } +} /// Percent-decode the given bytes. +/// +/// The return value is an iterator of decoded `u8` bytes +/// that also implements `Into>` +/// (which returns `Cow::Borrowed` when `input` contains no percent-encoded sequence) +/// and has `decode_utf8()` and `decode_utf8_lossy()` methods. 
#[inline] -pub fn percent_decode(input: &[u8]) -> Vec { - let mut output = Vec::new(); - percent_decode_to(input, &mut output); - output +pub fn percent_decode<'a>(input: &'a [u8]) -> PercentDecode<'a> { + PercentDecode { + bytes: input.iter() + } } +/// The return type of `percent_decode()`. +#[derive(Clone)] +pub struct PercentDecode<'a> { + bytes: slice::Iter<'a, u8>, +} -/// Percent-decode the given bytes, and decode the result as UTF-8. -/// -/// This is “lossy”: invalid UTF-8 percent-encoded byte sequences -/// will be replaced � U+FFFD, the replacement character. -#[inline] -pub fn lossy_utf8_percent_decode(input: &[u8]) -> String { - String::from_utf8_lossy(&percent_decode(input)).to_string() +fn after_percent_sign(iter: &mut slice::Iter) -> Option { + let initial_iter = iter.clone(); + let h = iter.next().and_then(|&b| (b as char).to_digit(16)); + let l = iter.next().and_then(|&b| (b as char).to_digit(16)); + if let (Some(h), Some(l)) = (h, l) { + Some(h as u8 * 0x10 + l as u8) + } else { + *iter = initial_iter; + None + } } -/// Convert the given hex character into its numeric value. -/// -/// # Examples -/// -/// ``` -/// use url::percent_encoding::from_hex; -/// assert_eq!(from_hex('0' as u8), Some(0)); -/// assert_eq!(from_hex('1' as u8), Some(1)); -/// assert_eq!(from_hex('9' as u8), Some(9)); -/// assert_eq!(from_hex('A' as u8), Some(10)); -/// assert_eq!(from_hex('a' as u8), Some(10)); -/// assert_eq!(from_hex('F' as u8), Some(15)); -/// assert_eq!(from_hex('f' as u8), Some(15)); -/// assert_eq!(from_hex('G' as u8), None); -/// assert_eq!(from_hex('g' as u8), None); -/// assert_eq!(from_hex('Z' as u8), None); -/// assert_eq!(from_hex('z' as u8), None); -/// ``` -#[inline] -pub fn from_hex(byte: u8) -> Option { - match byte { - b'0' ... b'9' => Some(byte - b'0'), // 0..9 - b'A' ... b'F' => Some(byte + 10 - b'A'), // A..F - b'a' ... 
b'f' => Some(byte + 10 - b'a'), // a..f - _ => None +impl<'a> Iterator for PercentDecode<'a> { + type Item = u8; + + fn next(&mut self) -> Option { + self.bytes.next().map(|&byte| { + if byte == b'%' { + after_percent_sign(&mut self.bytes).unwrap_or(byte) + } else { + byte + } + }) + } + + fn size_hint(&self) -> (usize, Option) { + let bytes = self.bytes.len(); + (bytes / 3, Some(bytes)) + } +} + +impl<'a> From> for Cow<'a, [u8]> { + fn from(iter: PercentDecode<'a>) -> Self { + match iter.if_any() { + Some(vec) => Cow::Owned(vec), + None => Cow::Borrowed(iter.bytes.as_slice()), + } + } +} + +impl<'a> PercentDecode<'a> { + /// If the percent-decoding is different from the input, return it as a new bytes vector. + pub fn if_any(&self) -> Option> { + let mut bytes_iter = self.bytes.clone(); + while bytes_iter.find(|&&b| b == b'%').is_some() { + if let Some(decoded_byte) = after_percent_sign(&mut bytes_iter) { + let initial_bytes = self.bytes.as_slice(); + let unchanged_bytes_len = initial_bytes.len() - bytes_iter.len() - 3; + let mut decoded = initial_bytes[..unchanged_bytes_len].to_owned(); + decoded.push(decoded_byte); + decoded.extend(PercentDecode { + bytes: bytes_iter + }); + return Some(decoded) + } + } + // Nothing to decode + None + } + + /// Decode the result of percent-decoding as UTF-8. + /// + /// This returns `Err` when the percent-decoded bytes are not well-formed in UTF-8. + pub fn decode_utf8(self) -> Result, str::Utf8Error> { + match self.clone().into() { + Cow::Borrowed(bytes) => { + match str::from_utf8(bytes) { + Ok(s) => Ok(s.into()), + Err(e) => Err(e), + } + } + Cow::Owned(bytes) => { + match String::from_utf8(bytes) { + Ok(s) => Ok(s.into()), + Err(e) => Err(e.utf8_error()), + } + } + } + } + + /// Decode the result of percent-decoding as UTF-8, lossily. + /// + /// Invalid UTF-8 percent-encoded byte sequences will be replaced with � U+FFFD, + /// the replacement character.
+ pub fn decode_utf8_lossy(self) -> Cow<'a, str> { + encoding::decode_utf8_lossy(self.clone().into()) } } diff --git a/src/quirks.rs b/src/quirks.rs new file mode 100644 index 00000000..3e25ac20 --- /dev/null +++ b/src/quirks.rs @@ -0,0 +1,218 @@ +// Copyright 2016 The rust-url developers. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! Getters and setters for URL components implemented per https://url.spec.whatwg.org/#api +//! +//! Unless you need to be interoperable with web browsers, +//! you probably want to use `Url` method instead. + +use {Url, Position, Host, ParseError, idna}; +use parser::{Parser, SchemeType, default_port, Context}; + +/// https://url.spec.whatwg.org/#dom-url-domaintoascii +pub fn domain_to_ascii(domain: &str) -> String { + match Host::parse(domain) { + Ok(Host::Domain(domain)) => domain, + _ => String::new(), + } +} + +/// https://url.spec.whatwg.org/#dom-url-domaintounicode +pub fn domain_to_unicode(domain: &str) -> String { + match Host::parse(domain) { + Ok(Host::Domain(ref domain)) => { + let (unicode, _errors) = idna::domain_to_unicode(domain); + unicode + } + _ => String::new(), + } +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-href +pub fn href(url: &Url) -> &str { + url.as_str() +} + +/// Setter for https://url.spec.whatwg.org/#dom-url-href +pub fn set_href(url: &mut Url, value: &str) -> Result<(), ParseError> { + *url = try!(Url::parse(value)); + Ok(()) +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-origin +pub fn origin(url: &Url) -> String { + url.origin().unicode_serialization() +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-protocol +#[inline] +pub fn protocol(url: &Url) -> &str { + &url.as_str()[..url.scheme().len() + ":".len()] +} + +/// Setter for https://url.spec.whatwg.org/#dom-url-protocol +pub fn set_protocol(url: &mut Url, mut 
new_protocol: &str) -> Result<(), ()> { + // The scheme state in the spec ignores everything after the first `:`, + // but `set_scheme` errors if there is more. + if let Some(position) = new_protocol.find(':') { + new_protocol = &new_protocol[..position]; + } + url.set_scheme(new_protocol) +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-username +#[inline] +pub fn username(url: &Url) -> &str { + url.username() +} + +/// Setter for https://url.spec.whatwg.org/#dom-url-username +pub fn set_username(url: &mut Url, new_username: &str) -> Result<(), ()> { + url.set_username(new_username) +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-password +#[inline] +pub fn password(url: &Url) -> &str { + url.password().unwrap_or("") +} + +/// Setter for https://url.spec.whatwg.org/#dom-url-password +pub fn set_password(url: &mut Url, new_password: &str) -> Result<(), ()> { + url.set_password(if new_password.is_empty() { None } else { Some(new_password) }) +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-host +#[inline] +pub fn host(url: &Url) -> &str { + &url[Position::BeforeHost..Position::AfterPort] +} + +/// Setter for https://url.spec.whatwg.org/#dom-url-host +pub fn set_host(url: &mut Url, new_host: &str) -> Result<(), ()> { + if url.cannot_be_a_base() { + return Err(()) + } + let host; + let opt_port; + { + let scheme = url.scheme(); + let result = Parser::parse_host(new_host, SchemeType::from(scheme), |_| ()); + match result { + Ok((h, remaining)) => { + host = h; + opt_port = if remaining.starts_with(':') { + Parser::parse_port(&remaining[1..], |_| (), || default_port(scheme), + Context::Setter) + .ok().map(|(port, _remaining)| port) + } else { + None + }; + } + Err(_) => return Err(()) + } + } + url.set_host_internal(host, opt_port); + Ok(()) +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-hostname +#[inline] +pub fn hostname(url: &Url) -> &str { + url.host_str().unwrap_or("") +} + +/// Setter for 
https://url.spec.whatwg.org/#dom-url-hostname +pub fn set_hostname(url: &mut Url, new_hostname: &str) -> Result<(), ()> { + if url.cannot_be_a_base() { + return Err(()) + } + let result = Parser::parse_host(new_hostname, SchemeType::from(url.scheme()), |_| ()); + if let Ok((host, _remaining)) = result { + url.set_host_internal(host, None); + Ok(()) + } else { + Err(()) + } +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-port +#[inline] +pub fn port(url: &Url) -> &str { + &url[Position::BeforePort..Position::AfterPort] +} + +/// Setter for https://url.spec.whatwg.org/#dom-url-port +pub fn set_port(url: &mut Url, new_port: &str) -> Result<(), ()> { + let result; + { + // has_host implies !cannot_be_a_base + let scheme = url.scheme(); + if !url.has_host() || scheme == "file" { + return Err(()) + } + result = Parser::parse_port(new_port, |_| (), || default_port(scheme), Context::Setter) + } + if let Ok((new_port, _remaining)) = result { + url.set_port_internal(new_port); + Ok(()) + } else { + Err(()) + } +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-pathname +#[inline] +pub fn pathname(url: &Url) -> &str { + url.path() +} + +/// Setter for https://url.spec.whatwg.org/#dom-url-pathname +pub fn set_pathname(url: &mut Url, new_pathname: &str) { + if !url.cannot_be_a_base() { + url.set_path(new_pathname) + } +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-search +pub fn search(url: &Url) -> &str { + trim(&url[Position::AfterPath..Position::AfterQuery]) +} + +/// Setter for https://url.spec.whatwg.org/#dom-url-search +pub fn set_search(url: &mut Url, new_search: &str) { + url.set_query(match new_search { + "" => None, + _ if new_search.starts_with('?') => Some(&new_search[1..]), + _ => Some(new_search), + }) +} + +/// Getter for https://url.spec.whatwg.org/#dom-url-hash +pub fn hash(url: &Url) -> &str { + trim(&url[Position::AfterQuery..]) +} + +/// Setter for https://url.spec.whatwg.org/#dom-url-hash +pub fn set_hash(url: &mut Url, new_hash: 
&str) { + if url.scheme() != "javascript" { + url.set_fragment(match new_hash { + "" => None, + _ if new_hash.starts_with('#') => Some(&new_hash[1..]), + _ => Some(new_hash), + }) + } +} + +fn trim(s: &str) -> &str { + if s.len() == 1 { + "" + } else { + s + } +} diff --git a/src/slicing.rs b/src/slicing.rs new file mode 100644 index 00000000..926f3c79 --- /dev/null +++ b/src/slicing.rs @@ -0,0 +1,182 @@ +// Copyright 2016 The rust-url developers. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use std::ops::{Range, RangeFrom, RangeTo, RangeFull, Index}; +use Url; + +impl Index for Url { + type Output = str; + fn index(&self, _: RangeFull) -> &str { + &self.serialization + } +} + +impl Index> for Url { + type Output = str; + fn index(&self, range: RangeFrom) -> &str { + &self.serialization[self.index(range.start)..] + } +} + +impl Index> for Url { + type Output = str; + fn index(&self, range: RangeTo) -> &str { + &self.serialization[..self.index(range.end)] + } +} + +impl Index> for Url { + type Output = str; + fn index(&self, range: Range) -> &str { + &self.serialization[self.index(range.start)..self.index(range.end)] + } +} + +/// Indicates a position within a URL based on its components. 
+/// +/// A range of positions can be used for slicing `Url`: +/// +/// ```rust +/// # use url::{Url, Position}; +/// # fn something(some_url: Url) { +/// let serialization: &str = &some_url[..]; +/// let serialization_without_fragment: &str = &some_url[..Position::AfterQuery]; +/// let authority: &str = &some_url[Position::BeforeUsername..Position::AfterPort]; +/// let data_url_payload: &str = &some_url[Position::BeforePath..Position::AfterQuery]; +/// let scheme_relative: &str = &some_url[Position::BeforeUsername..]; +/// # } +/// ``` +/// +/// In a pseudo-grammar (where `[`…`]?` makes a sub-sequence optional), +/// URL components and delimiters that separate them are: +/// +/// ```notrust +/// url = +/// scheme ":" +/// [ "//" [ username [ ":" password ]? "@" ]? host [ ":" port ]? ]? +/// path [ "?" query ]? [ "#" fragment ]? +/// ``` +/// +/// When a given component is not present, +/// its "before" and "after" positions are the same +/// (so that `&some_url[BeforeFoo..AfterFoo]` is the empty string) +/// and component ordering is preserved +/// (so that a missing query "is between" a path and a fragment). +/// +/// The end of a component and the start of the next are either the same or separated +/// by a delimiter. +/// (Note that the initial `/` of a path is considered part of the path here, not a delimiter.) +/// For example, `&url[..BeforeFragment]` would include a `#` delimiter (if present in `url`), +/// so `&url[..AfterQuery]` might be desired instead. +/// +/// `BeforeScheme` and `AfterFragment` are always the start and end of the entire URL, +/// so `&url[BeforeScheme..X]` is the same as `&url[..X]` +/// and `&url[X..AfterFragment]` is the same as `&url[X..]`.
+#[derive(Copy, Clone, Debug)] +pub enum Position { + BeforeScheme, + AfterScheme, + BeforeUsername, + AfterUsername, + BeforePassword, + AfterPassword, + BeforeHost, + AfterHost, + BeforePort, + AfterPort, + BeforePath, + AfterPath, + BeforeQuery, + AfterQuery, + BeforeFragment, + AfterFragment +} + +impl Url { + #[inline] + fn index(&self, position: Position) -> usize { + match position { + Position::BeforeScheme => 0, + + Position::AfterScheme => self.scheme_end as usize, + + Position::BeforeUsername => if self.has_authority() { + self.scheme_end as usize + "://".len() + } else { + debug_assert!(self.byte_at(self.scheme_end) == b':'); + debug_assert!(self.scheme_end + ":".len() as u32 == self.username_end); + self.scheme_end as usize + ":".len() + }, + + Position::AfterUsername => self.username_end as usize, + + Position::BeforePassword => if self.has_authority() && + self.byte_at(self.username_end) == b':' { + self.username_end as usize + ":".len() + } else { + debug_assert!(self.username_end == self.host_start); + self.username_end as usize + }, + + Position::AfterPassword => if self.has_authority() && + self.byte_at(self.username_end) == b':' { + debug_assert!(self.byte_at(self.host_start - "@".len() as u32) == b'@'); + self.host_start as usize - "@".len() + } else { + debug_assert!(self.username_end == self.host_start); + self.host_start as usize + }, + + Position::BeforeHost => self.host_start as usize, + + Position::AfterHost => self.host_end as usize, + + Position::BeforePort => if self.port.is_some() { + debug_assert!(self.byte_at(self.host_end) == b':'); + self.host_end as usize + ":".len() + } else { + self.host_end as usize + }, + + Position::AfterPort => self.path_start as usize, + + Position::BeforePath => self.path_start as usize, + + Position::AfterPath => match (self.query_start, self.fragment_start) { + (Some(q), _) => q as usize, + (None, Some(f)) => f as usize, + (None, None) => self.serialization.len(), + }, + + Position::BeforeQuery => match 
(self.query_start, self.fragment_start) { + (Some(q), _) => { + debug_assert!(self.byte_at(q) == b'?'); + q as usize + "?".len() + } + (None, Some(f)) => f as usize, + (None, None) => self.serialization.len(), + }, + + Position::AfterQuery => match self.fragment_start { + None => self.serialization.len(), + Some(f) => f as usize, + }, + + Position::BeforeFragment => match self.fragment_start { + Some(f) => { + debug_assert!(self.byte_at(f) == b'#'); + f as usize + "#".len() + } + None => self.serialization.len(), + }, + + Position::AfterFragment => self.serialization.len(), + } + } +} + diff --git a/src/urlutils.rs b/src/urlutils.rs deleted file mode 100644 index cd57b501..00000000 --- a/src/urlutils.rs +++ /dev/null @@ -1,169 +0,0 @@ -// Copyright 2013-2014 Simon Sapin. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - - -//! These methods are not meant for use in Rust code, -//! 
only to help implement the JavaScript URLUtils API: http://url.spec.whatwg.org/#urlutils - -use super::{Url, UrlParser, SchemeType, SchemeData, RelativeSchemeData}; -use parser::{ParseError, ParseResult, Context}; -use percent_encoding::{utf8_percent_encode_to, USERNAME_ENCODE_SET, PASSWORD_ENCODE_SET}; - - -#[allow(dead_code)] -pub struct UrlUtilsWrapper<'a> { - pub url: &'a mut Url, - pub parser: &'a UrlParser<'a>, -} - -#[doc(hidden)] -pub trait UrlUtils { - fn set_scheme(&mut self, input: &str) -> ParseResult<()>; - fn set_username(&mut self, input: &str) -> ParseResult<()>; - fn set_password(&mut self, input: &str) -> ParseResult<()>; - fn set_host_and_port(&mut self, input: &str) -> ParseResult<()>; - fn set_host(&mut self, input: &str) -> ParseResult<()>; - fn set_port(&mut self, input: &str) -> ParseResult<()>; - fn set_path(&mut self, input: &str) -> ParseResult<()>; - fn set_query(&mut self, input: &str) -> ParseResult<()>; - fn set_fragment(&mut self, input: &str) -> ParseResult<()>; -} - -impl<'a> UrlUtils for UrlUtilsWrapper<'a> { - /// `URLUtils.protocol` setter - fn set_scheme(&mut self, input: &str) -> ParseResult<()> { - match ::parser::parse_scheme(input, Context::Setter) { - Some((scheme, _)) => { - if self.parser.get_scheme_type(&self.url.scheme).same_as(self.parser.get_scheme_type(&scheme)) { - return Err(ParseError::InvalidScheme); - } - self.url.scheme = scheme; - Ok(()) - }, - None => Err(ParseError::InvalidScheme), - } - } - - /// `URLUtils.username` setter - fn set_username(&mut self, input: &str) -> ParseResult<()> { - match self.url.scheme_data { - SchemeData::Relative(RelativeSchemeData { ref mut username, .. 
}) => { - username.truncate(0); - utf8_percent_encode_to(input, USERNAME_ENCODE_SET, username); - Ok(()) - }, - SchemeData::NonRelative(_) => Err(ParseError::CannotSetUsernameWithNonRelativeScheme) - } - } - - /// `URLUtils.password` setter - fn set_password(&mut self, input: &str) -> ParseResult<()> { - match self.url.scheme_data { - SchemeData::Relative(RelativeSchemeData { ref mut password, .. }) => { - if input.len() == 0 { - *password = None; - return Ok(()); - } - let mut new_password = String::new(); - utf8_percent_encode_to(input, PASSWORD_ENCODE_SET, &mut new_password); - *password = Some(new_password); - Ok(()) - }, - SchemeData::NonRelative(_) => Err(ParseError::CannotSetPasswordWithNonRelativeScheme) - } - } - - /// `URLUtils.host` setter - fn set_host_and_port(&mut self, input: &str) -> ParseResult<()> { - match self.url.scheme_data { - SchemeData::Relative(RelativeSchemeData { - ref mut host, ref mut port, ref mut default_port, .. - }) => { - let scheme_type = self.parser.get_scheme_type(&self.url.scheme); - let (new_host, new_port, new_default_port, _) = try!(::parser::parse_host( - input, scheme_type, self.parser)); - *host = new_host; - *port = new_port; - *default_port = new_default_port; - Ok(()) - }, - SchemeData::NonRelative(_) => Err(ParseError::CannotSetHostPortWithNonRelativeScheme) - } - } - - /// `URLUtils.hostname` setter - fn set_host(&mut self, input: &str) -> ParseResult<()> { - match self.url.scheme_data { - SchemeData::Relative(RelativeSchemeData { ref mut host, .. }) => { - let (new_host, _) = try!(::parser::parse_hostname(input, self.parser)); - *host = new_host; - Ok(()) - }, - SchemeData::NonRelative(_) => Err(ParseError::CannotSetHostWithNonRelativeScheme) - } - } - - /// `URLUtils.port` setter - fn set_port(&mut self, input: &str) -> ParseResult<()> { - match self.url.scheme_data { - SchemeData::Relative(RelativeSchemeData { ref mut port, ref mut default_port, .. 
}) => { - let scheme_type = self.parser.get_scheme_type(&self.url.scheme); - if scheme_type == SchemeType::FileLike { - return Err(ParseError::CannotSetPortWithFileLikeScheme); - } - let (new_port, new_default_port, _) = try!(::parser::parse_port( - input, scheme_type, self.parser)); - *port = new_port; - *default_port = new_default_port; - Ok(()) - }, - SchemeData::NonRelative(_) => Err(ParseError::CannotSetPortWithNonRelativeScheme) - } - } - - /// `URLUtils.pathname` setter - fn set_path(&mut self, input: &str) -> ParseResult<()> { - match self.url.scheme_data { - SchemeData::Relative(RelativeSchemeData { ref mut path, .. }) => { - let scheme_type = self.parser.get_scheme_type(&self.url.scheme); - let (new_path, _) = try!(::parser::parse_path_start( - input, Context::Setter, scheme_type, self.parser)); - *path = new_path; - Ok(()) - }, - SchemeData::NonRelative(_) => Err(ParseError::CannotSetPathWithNonRelativeScheme) - } - } - - /// `URLUtils.search` setter - fn set_query(&mut self, input: &str) -> ParseResult<()> { - self.url.query = if input.is_empty() { - None - } else { - let input = if input.starts_with("?") { &input[1..] } else { input }; - let (new_query, _) = try!(::parser::parse_query( - input, Context::Setter, self.parser)); - Some(new_query) - }; - Ok(()) - } - - /// `URLUtils.hash` setter - fn set_fragment(&mut self, input: &str) -> ParseResult<()> { - if self.url.scheme == "javascript" { - return Err(ParseError::CannotSetJavascriptFragment) - } - self.url.fragment = if input.is_empty() { - None - } else { - let input = if input.starts_with("#") { &input[1..] } else { input }; - Some(try!(::parser::parse_fragment(input, self.parser))) - }; - Ok(()) - } -} diff --git a/tests/data.rs b/tests/data.rs new file mode 100644 index 00000000..b8945aa4 --- /dev/null +++ b/tests/data.rs @@ -0,0 +1,193 @@ +// Copyright 2013-2014 The rust-url developers. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. 
This file may not be copied, modified, or distributed +// except according to those terms. + +//! Data-driven tests + +extern crate rustc_serialize; +extern crate test; +extern crate url; + +use rustc_serialize::json::{self, Json}; +use url::{Url, quirks}; + + +fn run_parsing(input: String, base: String, expected: Result) { + let base = match Url::parse(&base) { + Ok(base) => base, + Err(message) => panic!("Error parsing base {:?}: {}", base, message) + }; + let (url, expected) = match (base.join(&input), expected) { + (Ok(url), Ok(expected)) => (url, expected), + (Err(_), Err(())) => return, + (Err(message), Ok(_)) => panic!("Error parsing URL {:?}: {}", input, message), + (Ok(_), Err(())) => panic!("Expected a parse error for URL {:?}", input), + }; + + url.assert_invariants(); + + macro_rules! assert_eq { + ($expected: expr, $got: expr) => { + { + let expected = $expected; + let got = $got; + assert!(expected == got, "{:?} != {} {:?} for URL {:?}", + got, stringify!($expected), expected, url); + } + } + } + + macro_rules! 
assert_attributes { + ($($attr: ident)+) => { + { + $( + assert_eq!(expected.$attr, quirks::$attr(&url)); + )+; + } + } + } + + assert_attributes!(href protocol username password host hostname port pathname search hash); + + if let Some(expected_origin) = expected.origin { + assert_eq!(expected_origin, quirks::origin(&url)); + } +} + +struct ExpectedAttributes { + href: String, + origin: Option, + protocol: String, + username: String, + password: String, + host: String, + hostname: String, + port: String, + pathname: String, + search: String, + hash: String, +} + +trait JsonExt { + fn take(&mut self, key: &str) -> Option; + fn object(self) -> json::Object; + fn string(self) -> String; + fn take_string(&mut self, key: &str) -> String; +} + +impl JsonExt for Json { + fn take(&mut self, key: &str) -> Option { + self.as_object_mut().unwrap().remove(key) + } + + fn object(self) -> json::Object { + if let Json::Object(o) = self { o } else { panic!("Not a Json::Object") } + } + + fn string(self) -> String { + if let Json::String(s) = self { s } else { panic!("Not a Json::String") } + } + + fn take_string(&mut self, key: &str) -> String { + self.take(key).unwrap().string() + } +} + +fn collect_parsing(add_test: &mut F) { + // Copied from https://github.com/w3c/web-platform-tests/blob/master/url/ + let mut json = Json::from_str(include_str!("urltestdata.json")) + .expect("JSON parse error in urltestdata.json"); + for entry in json.as_array_mut().unwrap() { + if entry.is_string() { + continue // ignore comments + } + let base = entry.take_string("base"); + let input = entry.take_string("input"); + let expected = if entry.find("failure").is_some() { + Err(()) + } else { + Ok(ExpectedAttributes { + href: entry.take_string("href"), + origin: entry.take("origin").map(Json::string), + protocol: entry.take_string("protocol"), + username: entry.take_string("username"), + password: entry.take_string("password"), + host: entry.take_string("host"), + hostname:
entry.take_string("hostname"), + port: entry.take_string("port"), + pathname: entry.take_string("pathname"), + search: entry.take_string("search"), + hash: entry.take_string("hash"), + }) + }; + add_test(format!("{:?} @ base {:?}", input, base), + test::TestFn::dyn_test_fn(move || run_parsing(input, base, expected))); + } +} + +fn collect_setters(add_test: &mut F) where F: FnMut(String, test::TestFn) { + let mut json = Json::from_str(include_str!("setters_tests.json")) + .expect("JSON parse error in setters_tests.json"); + + macro_rules! setter { + ($attr: expr, $setter: ident) => {{ + let mut tests = json.take($attr).unwrap(); + for mut test in tests.as_array_mut().unwrap().drain(..) { + let comment = test.take("comment").map(Json::string).unwrap_or(String::new()); + let href = test.take_string("href"); + let new_value = test.take_string("new_value"); + let name = format!("{:?}.{} = {:?} {}", href, $attr, new_value, comment); + let mut expected = test.take("expected").unwrap(); + add_test(name, test::TestFn::dyn_test_fn(move || { + let mut url = Url::parse(&href).unwrap(); + url.assert_invariants(); + let _ = quirks::$setter(&mut url, &new_value); + assert_attributes!(url, expected, + href protocol username password host hostname port pathname search hash); + url.assert_invariants(); + })) + } + }} + } + macro_rules! 
assert_attributes { + ($url: expr, $expected: expr, $($attr: ident)+) => { + $( + if let Some(value) = $expected.take(stringify!($attr)) { + assert_eq!(quirks::$attr(&$url), value.string()) + } + )+ + } + } + setter!("protocol", set_protocol); + setter!("username", set_username); + setter!("password", set_password); + setter!("hostname", set_hostname); + setter!("host", set_host); + setter!("port", set_port); + setter!("pathname", set_pathname); + setter!("search", set_search); + setter!("hash", set_hash); +} + +fn main() { + let mut tests = Vec::new(); + { + let mut add_one = |name: String, run: test::TestFn| { + tests.push(test::TestDescAndFn { + desc: test::TestDesc { + name: test::DynTestName(name), + ignore: false, + should_panic: test::ShouldPanic::No, + }, + testfn: run, + }) + }; + collect_parsing(&mut add_one); + collect_setters(&mut add_one); + } + test::test_main(&std::env::args().collect::>(), tests) +} diff --git a/tests/form_urlencoded.rs b/tests/form_urlencoded.rs deleted file mode 100644 index 59080cf9..00000000 --- a/tests/form_urlencoded.rs +++ /dev/null @@ -1,29 +0,0 @@ -extern crate url; - -use url::form_urlencoded::*; - -#[test] -fn test_form_urlencoded() { - let pairs = &[ - ("foo".to_string(), "é&".to_string()), - ("bar".to_string(), "".to_string()), - ("foo".to_string(), "#".to_string()) - ]; - let encoded = serialize(pairs); - assert_eq!(encoded, "foo=%C3%A9%26&bar=&foo=%23"); - assert_eq!(parse(encoded.as_bytes()), pairs.to_vec()); -} - -#[test] -fn test_form_serialize() { - let pairs = [("foo", "é&"), - ("bar", ""), - ("foo", "#")]; - - let want = "foo=%C3%A9%26&bar=&foo=%23"; - // Works with referenced tuples - assert_eq!(serialize(pairs.iter()), want); - // Works with owned tuples - assert_eq!(serialize(pairs.iter().map(|p| (p.0, p.1))), want); - -} diff --git a/tests/format.rs b/tests/format.rs deleted file mode 100644 index 39aac62a..00000000 --- a/tests/format.rs +++ /dev/null @@ -1,67 +0,0 @@ -extern crate url; - -use url::{Url, 
Host}; -use url::format::{PathFormatter, UserInfoFormatter}; - -#[test] -fn path_formatting() { - let data = [ - (vec![], "/"), - (vec![""], "/"), - (vec!["test", "path"], "/test/path"), - (vec!["test", "path", ""], "/test/path/") - ]; - for &(ref path, result) in &data { - assert_eq!(PathFormatter { - path: path - }.to_string(), result.to_string()); - } -} - -#[test] -fn host() { - // libstd’s `Display for Ipv6Addr` serializes 0:0:0:0:0:0:_:_ and 0:0:0:0:0:ffff:_:_ - // using IPv4-like syntax, as suggested in https://tools.ietf.org/html/rfc5952#section-4 - // but https://url.spec.whatwg.org/#concept-ipv6-serializer specifies not to. - - // Not [::0.0.0.2] / [::ffff:0.0.0.2] - assert_eq!(Host::parse("[0::2]").unwrap().to_string(), "[::2]"); - assert_eq!(Host::parse("[0::ffff:0:2]").unwrap().to_string(), "[::ffff:0:2]"); -} - -#[test] -fn userinfo_formatting() { - // Test data as (username, password, result) tuples. - let data = [ - ("", None, ""), - ("", Some(""), ":@"), - ("", Some("password"), ":password@"), - ("username", None, "username@"), - ("username", Some(""), "username:@"), - ("username", Some("password"), "username:password@") - ]; - for &(username, password, result) in &data { - assert_eq!(UserInfoFormatter { - username: username, - password: password - }.to_string(), result.to_string()); - } -} - -#[test] -fn relative_scheme_url_formatting() { - let data = [ - ("http://example.com/", "http://example.com/"), - ("http://addslash.com", "http://addslash.com/"), - ("http://@emptyuser.com/", "http://emptyuser.com/"), - ("http://:@emptypass.com/", "http://:@emptypass.com/"), - ("http://user@user.com/", "http://user@user.com/"), - ("http://user:pass@userpass.com/", "http://user:pass@userpass.com/"), - ("http://slashquery.com/path/?q=something", "http://slashquery.com/path/?q=something"), - ("http://noslashquery.com/path?q=something", "http://noslashquery.com/path?q=something") - ]; - for &(input, result) in &data { - let url = Url::parse(input).unwrap(); - 
assert_eq!(url.to_string(), result.to_string()); - } -} diff --git a/tests/idna.rs b/tests/idna.rs deleted file mode 100644 index bb03f39d..00000000 --- a/tests/idna.rs +++ /dev/null @@ -1,104 +0,0 @@ -extern crate url; - -use std::char; -use url::idna; - -#[test] -fn test_uts46() { - // http://www.unicode.org/Public/idna/latest/IdnaTest.txt - for line in include_str!("IdnaTest.txt").lines() { - if line == "" || line.starts_with("#") { - continue - } - // Remove comments - let mut line = match line.find("#") { - Some(index) => &line[0..index], - None => line - }; - - let mut expected_failure = false; - if line.starts_with("XFAIL") { - expected_failure = true; - line = &line[5..line.len()]; - }; - - let mut pieces = line.split(';').map(|x| x.trim()).collect::>(); - - let test_type = pieces.remove(0); - let original = pieces.remove(0); - let source = unescape(original); - let to_unicode = pieces.remove(0); - let to_ascii = pieces.remove(0); - let _nv8 = if pieces.len() > 0 { pieces.remove(0) } else { "" }; - - if expected_failure { - continue; - } - - let result = idna::uts46_to_ascii(&source, idna::Uts46Flags { - use_std3_ascii_rules: true, - transitional_processing: test_type == "T", - verify_dns_length: true, - }); - - if to_ascii.starts_with("[") { - if to_ascii.starts_with("[C") { - // http://unicode.org/reports/tr46/#Deviations - // applications that perform IDNA2008 lookup are not required to check for these contexts - continue; - } - let res = result.ok(); - assert!(res == None, "Expected error. result: {} | original: {} | source: {}", res.unwrap(), original, source); - continue; - } - - let to_ascii = if to_ascii.len() > 0 { - to_ascii.to_string() - } else { - if to_unicode.len() > 0 { - to_unicode.to_string() - } else { - source.clone() - } - }; - - if _nv8 == "NV8" { - // This result isn't valid under IDNA2008. 
Skip it - continue; - } - - assert!(result.is_ok(), "Couldn't parse {} | original: {} | error: {:?}", source, original, result.err()); - let output = result.ok().unwrap(); - assert!(output == to_ascii, "result: {} | expected: {} | original: {} | source: {}", output, to_ascii, original, source); - } -} - -fn unescape(input: &str) -> String { - let mut output = String::new(); - let mut chars = input.chars(); - loop { - match chars.next() { - None => return output, - Some(c) => - if c == '\\' { - match chars.next().unwrap() { - '\\' => output.push('\\'), - 'u' => { - let c1 = chars.next().unwrap().to_digit(16).unwrap(); - let c2 = chars.next().unwrap().to_digit(16).unwrap(); - let c3 = chars.next().unwrap().to_digit(16).unwrap(); - let c4 = chars.next().unwrap().to_digit(16).unwrap(); - match char::from_u32((((c1 * 16 + c2) * 16 + c3) * 16 + c4)) - { - Some(c) => output.push(c), - None => { output.push_str(&format!("\\u{:X}{:X}{:X}{:X}",c1,c2,c3,c4)); } - }; - } - _ => panic!("Invalid test data input"), - } - } else { - output.push(c); - } - } - } -} diff --git a/tests/punycode.rs b/tests/punycode.rs deleted file mode 100644 index ae42b34d..00000000 --- a/tests/punycode.rs +++ /dev/null @@ -1,52 +0,0 @@ -extern crate url; -extern crate rustc_serialize; - -use url::punycode::{decode, encode_str}; -use rustc_serialize::json::{Json, Object}; - -fn one_test(description: &str, decoded: &str, encoded: &str) { - match decode(encoded) { - None => panic!("Decoding {} failed.", encoded), - Some(result) => { - let result = result.into_iter().collect::(); - assert!(result == decoded, - format!("Incorrect decoding of {}:\n {}\n!= {}\n{}", - encoded, result, decoded, description)) - } - } - - match encode_str(decoded) { - None => panic!("Encoding {} failed.", decoded), - Some(result) => { - assert!(result == encoded, - format!("Incorrect encoding of {}:\n {}\n!= {}\n{}", - decoded, result, encoded, description)) - } - } -} - -fn get_string<'a>(map: &'a Object, key: &str) -> &'a str 
{ - match map.get(&key.to_string()) { - Some(&Json::String(ref s)) => s, - None => "", - _ => panic!(), - } -} - -#[test] -fn test_punycode() { - - match Json::from_str(include_str!("punycode_tests.json")) { - Ok(Json::Array(tests)) => for test in &tests { - match test { - &Json::Object(ref o) => one_test( - get_string(o, "description"), - get_string(o, "decoded"), - get_string(o, "encoded") - ), - _ => panic!(), - } - }, - other => panic!("{:?}", other) - } -} diff --git a/tests/setters_tests.json b/tests/setters_tests.json new file mode 100644 index 00000000..b60e49a9 --- /dev/null +++ b/tests/setters_tests.json @@ -0,0 +1,1148 @@ +{ + "comment": [ + "## Tests for setters of https://url.spec.whatwg.org/#urlutils-members", + "", + "This file contains a JSON object.", + "Other than 'comment', each key is an attribute of the `URL` interface", + "defined in WHATWG’s URL Standard.", + "The values are arrays of test case objects for that attribute.", + "", + "To run a test case for the attribute `attr`:", + "", + "* Create a new `URL` object with the value for the 'href' key", + " as the constructor’s single parameter. (Without a base URL.)", + " This must not throw.", + "* Set the attribute `attr` to (invoke its setter with)", + " the value for the 'new_value' key.", + "* The value for the 'expected' key is another object.", + " For each `key` / `value` pair of that object,", + " get the attribute `key` (invoke its getter).", + " The returned string must be equal to `value`.", + "", + "Note: the 'href' setter is already covered by urltestdata.json." + ], + "protocol": [ + { + "comment": "The empty string is not a valid scheme. 
Setter leaves the URL unchanged.", + "href": "a://example.net", + "new_value": "", + "expected": { + "href": "a://example.net/", + "protocol": "a:" + } + }, + { + "href": "a://example.net", + "new_value": "b", + "expected": { + "href": "b://example.net/", + "protocol": "b:" + } + }, + { + "comment": "Upper-case ASCII is lower-cased", + "href": "a://example.net", + "new_value": "B", + "expected": { + "href": "b://example.net/", + "protocol": "b:" + } + }, + { + "comment": "Non-ASCII is rejected", + "href": "a://example.net", + "new_value": "é", + "expected": { + "href": "a://example.net/", + "protocol": "a:" + } + }, + { + "comment": "No leading digit", + "href": "a://example.net", + "new_value": "0b", + "expected": { + "href": "a://example.net/", + "protocol": "a:" + } + }, + { + "comment": "No leading punctuation", + "href": "a://example.net", + "new_value": "+b", + "expected": { + "href": "a://example.net/", + "protocol": "a:" + } + }, + { + "href": "a://example.net", + "new_value": "bC0+-.", + "expected": { + "href": "bc0+-.://example.net/", + "protocol": "bc0+-.:" + } + }, + { + "comment": "Only some punctuation is acceptable", + "href": "a://example.net", + "new_value": "b,c", + "expected": { + "href": "a://example.net/", + "protocol": "a:" + } + }, + { + "comment": "Non-ASCII is rejected", + "href": "a://example.net", + "new_value": "bé", + "expected": { + "href": "a://example.net/", + "protocol": "a:" + } + }, + { + "comment": "Spec deviation: from special scheme to not is not problematic. 
https://github.com/whatwg/url/issues/104", + "href": "http://example.net", + "new_value": "b", + "expected": { + "href": "b://example.net/", + "protocol": "b:" + } + }, + { + "comment": "Cannot-be-a-base URL doesn’t have a host, but URL in a special scheme must.", + "href": "mailto:me@example.net", + "new_value": "http", + "expected": { + "href": "mailto:me@example.net", + "protocol": "mailto:" + } + }, + { + "comment": "Spec deviation: from non-special scheme with a host to special is not problematic. https://github.com/whatwg/url/issues/104", + "href": "ssh://me@example.net", + "new_value": "http", + "expected": { + "href": "http://me@example.net/", + "protocol": "http:" + } + }, + { + "comment": "Stuff after the first ':' is ignored", + "href": "http://example.net", + "new_value": "https:foo : bar", + "expected": { + "href": "https://example.net/", + "protocol": "https:" + } + }, + { + "comment": "Stuff after the first ':' is ignored", + "href": "data:text/html,

Test", + "new_value": "view-source+data:foo : bar", + "expected": { + "href": "view-source+data:text/html,

Test", + "protocol": "view-source+data:" + } + } + ], + "username": [ + { + "comment": "No host means no username", + "href": "file:///home/you/index.html", + "new_value": "me", + "expected": { + "href": "file:///home/you/index.html", + "username": "" + } + }, + { + "comment": "No host means no username", + "href": "unix:/run/foo.socket", + "new_value": "me", + "expected": { + "href": "unix:/run/foo.socket", + "username": "" + } + }, + { + "comment": "Cannot-be-a-base means no username", + "href": "mailto:you@example.net", + "new_value": "me", + "expected": { + "href": "mailto:you@example.net", + "username": "" + } + }, + { + "href": "http://example.net", + "new_value": "me", + "expected": { + "href": "http://me@example.net/", + "username": "me" + } + }, + { + "href": "http://:secret@example.net", + "new_value": "me", + "expected": { + "href": "http://me:secret@example.net/", + "username": "me" + } + }, + { + "href": "http://me@example.net", + "new_value": "", + "expected": { + "href": "http://example.net/", + "username": "" + } + }, + { + "href": "http://me:secret@example.net", + "new_value": "", + "expected": { + "href": "http://:secret@example.net/", + "username": "" + } + }, + { + "comment": "UTF-8 percent encoding with the userinfo encode set.", + "href": "http://example.net", + "new_value": "\u0000\u0001\t\n\r\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", + "expected": { + "href": "http://%00%01%09%0A%0D%1F%20!%22%23$%&'()*+,-.%2F09%3A%3B%3C%3D%3E%3F%40AZ%5B%5C%5D%5E_%60az%7B%7C%7D~%7F%C2%80%C2%81%C3%89%C3%A9@example.net/", + "username": "%00%01%09%0A%0D%1F%20!%22%23$%&'()*+,-.%2F09%3A%3B%3C%3D%3E%3F%40AZ%5B%5C%5D%5E_%60az%7B%7C%7D~%7F%C2%80%C2%81%C3%89%C3%A9" + } + }, + { + "comment": "Bytes already percent-encoded are left as-is.", + "href": "http://example.net", + "new_value": "%c3%89té", + "expected": { + "href": "http://%c3%89t%C3%A9@example.net/", + "username": "%c3%89t%C3%A9" + } + } + ], + "password": [ + { + "comment": "No host 
means no password", + "href": "file:///home/me/index.html", + "new_value": "secret", + "expected": { + "href": "file:///home/me/index.html", + "password": "" + } + }, + { + "comment": "No host means no password", + "href": "unix:/run/foo.socket", + "new_value": "secret", + "expected": { + "href": "unix:/run/foo.socket", + "password": "" + } + }, + { + "comment": "Cannot-be-a-base means no password", + "href": "mailto:me@example.net", + "new_value": "secret", + "expected": { + "href": "mailto:me@example.net", + "password": "" + } + }, + { + "href": "http://example.net", + "new_value": "secret", + "expected": { + "href": "http://:secret@example.net/", + "password": "secret" + } + }, + { + "href": "http://me@example.net", + "new_value": "secret", + "expected": { + "href": "http://me:secret@example.net/", + "password": "secret" + } + }, + { + "href": "http://:secret@example.net", + "new_value": "", + "expected": { + "href": "http://example.net/", + "password": "" + } + }, + { + "href": "http://me:secret@example.net", + "new_value": "", + "expected": { + "href": "http://me@example.net/", + "password": "" + } + }, + { + "comment": "UTF-8 percent encoding with the userinfo encode set.", + "href": "http://example.net", + "new_value": "\u0000\u0001\t\n\r\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", + "expected": { + "href": "http://:%00%01%09%0A%0D%1F%20!%22%23$%&'()*+,-.%2F09%3A%3B%3C%3D%3E%3F%40AZ%5B%5C%5D%5E_%60az%7B%7C%7D~%7F%C2%80%C2%81%C3%89%C3%A9@example.net/", + "password": "%00%01%09%0A%0D%1F%20!%22%23$%&'()*+,-.%2F09%3A%3B%3C%3D%3E%3F%40AZ%5B%5C%5D%5E_%60az%7B%7C%7D~%7F%C2%80%C2%81%C3%89%C3%A9" + } + }, + { + "comment": "Bytes already percent-encoded are left as-is.", + "href": "http://example.net", + "new_value": "%c3%89té", + "expected": { + "href": "http://:%c3%89t%C3%A9@example.net/", + "password": "%c3%89t%C3%A9" + } + } + ], + "host": [ + { + "comment": "Cannot-be-a-base means no host", + "href": "mailto:me@example.net", + 
"new_value": "example.com", + "expected": { + "href": "mailto:me@example.net", + "host": "" + } + }, + { + "comment": "Cannot-be-a-base means no password", + "href": "data:text/plain,Stuff", + "new_value": "example.net", + "expected": { + "href": "data:text/plain,Stuff", + "host": "" + } + }, + { + "href": "http://example.net", + "new_value": "example.com:8080", + "expected": { + "href": "http://example.com:8080/", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "Port number is unchanged if not specified in the new value", + "href": "http://example.net:8080", + "new_value": "example.com", + "expected": { + "href": "http://example.com:8080/", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "Port number is removed if empty in the new value: https://github.com/whatwg/url/pull/113", + "href": "http://example.net:8080", + "new_value": "example.com:", + "expected": { + "href": "http://example.com/", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "The empty host is not valid for special schemes", + "href": "http://example.net", + "new_value": "", + "expected": { + "href": "http://example.net/", + "host": "example.net" + } + }, + { + "comment": "The empty host is OK for non-special schemes", + "href": "view-source+http://example.net/foo", + "new_value": "", + "expected": { + "href": "view-source+http:///foo", + "host": "" + } + }, + { + "comment": "Path-only URLs can gain a host", + "href": "a:/foo", + "new_value": "example.net", + "expected": { + "href": "a://example.net/foo", + "host": "example.net" + } + }, + { + "comment": "Path-only URLs can gain a host", + "href": "a:/foo", + "new_value": "example.net", + "expected": { + "href": "a://example.net/foo", + "host": "example.net" + } + }, + { + "comment": "IPv4 address syntax is normalized", + "href": "http://example.net", + "new_value": "0x7F000001:8080", + "expected": 
{ + "href": "http://127.0.0.1:8080/", + "host": "127.0.0.1:8080", + "hostname": "127.0.0.1", + "port": "8080" + } + }, + { + "comment": "IPv6 address syntax is normalized", + "href": "http://example.net", + "new_value": "[::0:01]:2", + "expected": { + "href": "http://[::1]:2/", + "host": "[::1]:2", + "hostname": "[::1]", + "port": "2" + } + }, + { + "comment": "Default port number is removed", + "href": "http://example.net", + "new_value": "example.com:80", + "expected": { + "href": "http://example.com/", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Default port number is removed", + "href": "https://example.net", + "new_value": "example.com:443", + "expected": { + "href": "https://example.com/", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Default port number is only removed for the relevant scheme", + "href": "https://example.net", + "new_value": "example.com:80", + "expected": { + "href": "https://example.com:80/", + "host": "example.com:80", + "hostname": "example.com", + "port": "80" + } + }, + { + "comment": "Stuff after a / delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com/stuff", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Stuff after a / delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com:8080/stuff", + "expected": { + "href": "http://example.com:8080/path", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "Stuff after a ? delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com?stuff", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Stuff after a ? 
delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com:8080?stuff", + "expected": { + "href": "http://example.com:8080/path", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "Stuff after a # delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com#stuff", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Stuff after a # delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com:8080#stuff", + "expected": { + "href": "http://example.com:8080/path", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "Stuff after a \\ delimiter is ignored for special schemes", + "href": "http://example.net/path", + "new_value": "example.com\\stuff", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Stuff after a \\ delimiter is ignored for special schemes", + "href": "http://example.net/path", + "new_value": "example.com:8080\\stuff", + "expected": { + "href": "http://example.com:8080/path", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "\\ is not a delimiter for non-special schemes, and it’s invalid in a domain", + "href": "view-source+http://example.net/path", + "new_value": "example.com\\stuff", + "expected": { + "href": "view-source+http://example.net/path", + "host": "example.net", + "hostname": "example.net", + "port": "" + } + }, + { + "comment": "Anything other than ASCII digit stops the port parser in a setter but is not an error", + "href": "view-source+http://example.net/path", + "new_value": "example.com:8080stuff2", + "expected": { + "href": "view-source+http://example.com:8080/path", + "host": "example.com:8080", + "hostname": 
"example.com", + "port": "8080" + } + }, + { + "comment": "Anything other than ASCII digit stops the port parser in a setter but is not an error", + "href": "http://example.net/path", + "new_value": "example.com:8080stuff2", + "expected": { + "href": "http://example.com:8080/path", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "Anything other than ASCII digit stops the port parser in a setter but is not an error", + "href": "http://example.net/path", + "new_value": "example.com:8080+2", + "expected": { + "href": "http://example.com:8080/path", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "Port numbers are 16 bit integers", + "href": "http://example.net/path", + "new_value": "example.com:65535", + "expected": { + "href": "http://example.com:65535/path", + "host": "example.com:65535", + "hostname": "example.com", + "port": "65535" + } + }, + { + "comment": "Port numbers are 16 bit integers, overflowing is an error. 
Hostname is still set, though.", + "href": "http://example.net/path", + "new_value": "example.com:65536", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + } + ], + "hostname": [ + { + "comment": "Cannot-be-a-base means no host", + "href": "mailto:me@example.net", + "new_value": "example.com", + "expected": { + "href": "mailto:me@example.net", + "host": "" + } + }, + { + "comment": "Cannot-be-a-base means no host", + "href": "data:text/plain,Stuff", + "new_value": "example.net", + "expected": { + "href": "data:text/plain,Stuff", + "host": "" + } + }, + { + "href": "http://example.net:8080", + "new_value": "example.com", + "expected": { + "href": "http://example.com:8080/", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "The empty host is not valid for special schemes", + "href": "http://example.net", + "new_value": "", + "expected": { + "href": "http://example.net/", + "host": "example.net" + } + }, + { + "comment": "The empty host is OK for non-special schemes", + "href": "view-source+http://example.net/foo", + "new_value": "", + "expected": { + "href": "view-source+http:///foo", + "host": "" + } + }, + { + "comment": "Path-only URLs can gain a host", + "href": "a:/foo", + "new_value": "example.net", + "expected": { + "href": "a://example.net/foo", + "host": "example.net" + } + }, + { + "comment": "Path-only URLs can gain a host", + "href": "a:/foo", + "new_value": "example.net", + "expected": { + "href": "a://example.net/foo", + "host": "example.net" + } + }, + { + "comment": "IPv4 address syntax is normalized", + "href": "http://example.net:8080", + "new_value": "0x7F000001", + "expected": { + "href": "http://127.0.0.1:8080/", + "host": "127.0.0.1:8080", + "hostname": "127.0.0.1", + "port": "8080" + } + }, + { + "comment": "IPv6 address syntax is normalized", + "href": "http://example.net", + "new_value": "[::0:01]", + 
"expected": { + "href": "http://[::1]/", + "host": "[::1]", + "hostname": "[::1]", + "port": "" + } + }, + { + "comment": "Stuff after a : delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com:8080", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Stuff after a : delimiter is ignored", + "href": "http://example.net:8080/path", + "new_value": "example.com:", + "expected": { + "href": "http://example.com:8080/path", + "host": "example.com:8080", + "hostname": "example.com", + "port": "8080" + } + }, + { + "comment": "Stuff after a / delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com/stuff", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Stuff after a ? delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com?stuff", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Stuff after a # delimiter is ignored", + "href": "http://example.net/path", + "new_value": "example.com#stuff", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "Stuff after a \\ delimiter is ignored for special schemes", + "href": "http://example.net/path", + "new_value": "example.com\\stuff", + "expected": { + "href": "http://example.com/path", + "host": "example.com", + "hostname": "example.com", + "port": "" + } + }, + { + "comment": "\\ is not a delimiter for non-special schemes, and it’s invalid in a domain", + "href": "view-source+http://example.net/path", + "new_value": "example.com\\stuff", + "expected": { + "href": "view-source+http://example.net/path", + "host": "example.net", + "hostname": "example.net", + 
"port": "" + } + } + ], + "port": [ + { + "href": "http://example.net", + "new_value": "8080", + "expected": { + "href": "http://example.net:8080/", + "host": "example.net:8080", + "hostname": "example.net", + "port": "8080" + } + }, + { + "comment": "Port number is removed if empty in the new value: https://github.com/whatwg/url/pull/113", + "href": "http://example.net:8080", + "new_value": "", + "expected": { + "href": "http://example.net/", + "host": "example.net", + "hostname": "example.net", + "port": "" + } + }, + { + "comment": "Default port number is removed", + "href": "http://example.net:8080", + "new_value": "80", + "expected": { + "href": "http://example.net/", + "host": "example.net", + "hostname": "example.net", + "port": "" + } + }, + { + "comment": "Default port number is removed", + "href": "https://example.net:4433", + "new_value": "443", + "expected": { + "href": "https://example.net/", + "host": "example.net", + "hostname": "example.net", + "port": "" + } + }, + { + "comment": "Default port number is only removed for the relevant scheme", + "href": "https://example.net", + "new_value": "80", + "expected": { + "href": "https://example.net:80/", + "host": "example.net:80", + "hostname": "example.net", + "port": "80" + } + }, + { + "comment": "Stuff after a / delimiter is ignored", + "href": "http://example.net/path", + "new_value": "8080/stuff", + "expected": { + "href": "http://example.net:8080/path", + "host": "example.net:8080", + "hostname": "example.net", + "port": "8080" + } + }, + { + "comment": "Stuff after a ? 
delimiter is ignored", + "href": "http://example.net/path", + "new_value": "8080?stuff", + "expected": { + "href": "http://example.net:8080/path", + "host": "example.net:8080", + "hostname": "example.net", + "port": "8080" + } + }, + { + "comment": "Stuff after a # delimiter is ignored", + "href": "http://example.net/path", + "new_value": "8080#stuff", + "expected": { + "href": "http://example.net:8080/path", + "host": "example.net:8080", + "hostname": "example.net", + "port": "8080" + } + }, + { + "comment": "Stuff after a \\ delimiter is ignored for special schemes", + "href": "http://example.net/path", + "new_value": "8080\\stuff", + "expected": { + "href": "http://example.net:8080/path", + "host": "example.net:8080", + "hostname": "example.net", + "port": "8080" + } + }, + { + "comment": "Anything other than ASCII digit stops the port parser in a setter but is not an error", + "href": "view-source+http://example.net/path", + "new_value": "8080stuff2", + "expected": { + "href": "view-source+http://example.net:8080/path", + "host": "example.net:8080", + "hostname": "example.net", + "port": "8080" + } + }, + { + "comment": "Anything other than ASCII digit stops the port parser in a setter but is not an error", + "href": "http://example.net/path", + "new_value": "8080stuff2", + "expected": { + "href": "http://example.net:8080/path", + "host": "example.net:8080", + "hostname": "example.net", + "port": "8080" + } + }, + { + "comment": "Anything other than ASCII digit stops the port parser in a setter but is not an error", + "href": "http://example.net/path", + "new_value": "8080+2", + "expected": { + "href": "http://example.net:8080/path", + "host": "example.net:8080", + "hostname": "example.net", + "port": "8080" + } + }, + { + "comment": "Port numbers are 16 bit integers", + "href": "http://example.net/path", + "new_value": "65535", + "expected": { + "href": "http://example.net:65535/path", + "host": "example.net:65535", + "hostname": "example.net", + "port": 
"65535" + } + }, + { + "comment": "Port numbers are 16 bit integers, overflowing is an error", + "href": "http://example.net:8080/path", + "new_value": "65536", + "expected": { + "href": "http://example.net:8080/path", + "host": "example.net:8080", + "hostname": "example.net", + "port": "8080" + } + } + ], + "pathname": [ + { + "comment": "Cannot-be-a-base don’t have a path", + "href": "mailto:me@example.net", + "new_value": "/foo", + "expected": { + "href": "mailto:me@example.net", + "pathname": "me@example.net" + } + }, + { + "href": "unix:/run/foo.socket?timeout=10", + "new_value": "/var/log/../run/bar.socket", + "expected": { + "href": "unix:/var/run/bar.socket?timeout=10", + "pathname": "/var/run/bar.socket" + } + }, + { + "href": "https://example.net#nav", + "new_value": "home", + "expected": { + "href": "https://example.net/home#nav", + "pathname": "/home" + } + }, + { + "href": "https://example.net#nav", + "new_value": "../home", + "expected": { + "href": "https://example.net/home#nav", + "pathname": "/home" + } + }, + { + "comment": "\\ is a segment delimiter for 'special' URLs", + "href": "http://example.net/home?lang=fr#nav", + "new_value": "\\a\\%2E\\b\\%2e.\\c", + "expected": { + "href": "http://example.net/a/c?lang=fr#nav", + "pathname": "/a/c" + } + }, + { + "comment": "\\ is *not* a segment delimiter for non-'special' URLs", + "href": "view-source+http://example.net/home?lang=fr#nav", + "new_value": "\\a\\%2E\\b\\%2e.\\c", + "expected": { + "href": "view-source+http://example.net/\\a\\.\\b\\..\\c?lang=fr#nav", + "pathname": "/\\a\\.\\b\\..\\c" + } + }, + { + "comment": "UTF-8 percent encoding with the default encode set. 
Tabs and newlines are removed.", + "href": "a:/", + "new_value": "\u0000\u0001\t\n\r\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", + "expected": { + "href": "a:/%00%01%1F%20!%22%23$%&'()*+,-./09:;%3C=%3E%3F@AZ[\\]^_%60az%7B|%7D~%7F%C2%80%C2%81%C3%89%C3%A9", + "pathname": "/%00%01%1F%20!%22%23$%&'()*+,-./09:;%3C=%3E%3F@AZ[\\]^_%60az%7B|%7D~%7F%C2%80%C2%81%C3%89%C3%A9" + } + }, + { + "comment": "Bytes already percent-encoded are left as-is, except %2E.", + "href": "http://example.net", + "new_value": "%2e%2E%c3%89té", + "expected": { + "href": "http://example.net/..%c3%89t%C3%A9", + "pathname": "/..%c3%89t%C3%A9" + } + } + ], + "search": [ + { + "href": "https://example.net#nav", + "new_value": "lang=fr", + "expected": { + "href": "https://example.net/?lang=fr#nav", + "search": "?lang=fr" + } + }, + { + "href": "https://example.net?lang=en-US#nav", + "new_value": "lang=fr", + "expected": { + "href": "https://example.net/?lang=fr#nav", + "search": "?lang=fr" + } + }, + { + "href": "https://example.net?lang=en-US#nav", + "new_value": "?lang=fr", + "expected": { + "href": "https://example.net/?lang=fr#nav", + "search": "?lang=fr" + } + }, + { + "href": "https://example.net?lang=en-US#nav", + "new_value": "??lang=fr", + "expected": { + "href": "https://example.net/??lang=fr#nav", + "search": "??lang=fr" + } + }, + { + "href": "https://example.net?lang=en-US#nav", + "new_value": "?", + "expected": { + "href": "https://example.net/?#nav", + "search": "" + } + }, + { + "href": "https://example.net?lang=en-US#nav", + "new_value": "", + "expected": { + "href": "https://example.net/#nav", + "search": "" + } + }, + { + "href": "https://example.net?lang=en-US", + "new_value": "", + "expected": { + "href": "https://example.net/", + "search": "" + } + }, + { + "href": "https://example.net", + "new_value": "", + "expected": { + "href": "https://example.net/", + "search": "" + } + }, + { + "comment": "UTF-8 percent encoding with the query encode set. 
Tabs and newlines are removed.", + "href": "a:/", + "new_value": "\u0000\u0001\t\n\r\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", + "expected": { + "href": "a:/?%00%01%1F%20!%22%23$%&'()*+,-./09:;%3C=%3E?@AZ[\\]^_`az{|}~%7F%C2%80%C2%81%C3%89%C3%A9", + "search": "?%00%01%1F%20!%22%23$%&'()*+,-./09:;%3C=%3E?@AZ[\\]^_`az{|}~%7F%C2%80%C2%81%C3%89%C3%A9" + } + }, + { + "comment": "Bytes already percent-encoded are left as-is", + "href": "http://example.net", + "new_value": "%c3%89té", + "expected": { + "href": "http://example.net/?%c3%89t%C3%A9", + "search": "?%c3%89t%C3%A9" + } + } + ], + "hash": [ + { + "href": "https://example.net", + "new_value": "main", + "expected": { + "href": "https://example.net/#main", + "hash": "#main" + } + }, + { + "href": "https://example.net#nav", + "new_value": "main", + "expected": { + "href": "https://example.net/#main", + "hash": "#main" + } + }, + { + "href": "https://example.net?lang=en-US", + "new_value": "##nav", + "expected": { + "href": "https://example.net/?lang=en-US##nav", + "hash": "##nav" + } + }, + { + "href": "https://example.net?lang=en-US#nav", + "new_value": "#main", + "expected": { + "href": "https://example.net/?lang=en-US#main", + "hash": "#main" + } + }, + { + "href": "https://example.net?lang=en-US#nav", + "new_value": "#", + "expected": { + "href": "https://example.net/?lang=en-US#", + "hash": "" + } + }, + { + "href": "https://example.net?lang=en-US#nav", + "new_value": "", + "expected": { + "href": "https://example.net/?lang=en-US", + "hash": "" + } + }, + { + "comment": "No percent-encoding at all (!); nuls, tabs, and newlines are removed", + "href": "a:/", + "new_value": "\u0000\u0001\t\n\r\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", + "expected": { + "href": "a:/#\u0001\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé", + "hash": "#\u0001\u001f !\"#$%&'()*+,-./09:;<=>?@AZ[\\]^_`az{|}~\u007f\u0080\u0081Éé" + } + }, + { + "comment": "Bytes 
already percent-encoded are left as-is", + "href": "http://example.net", + "new_value": "%c3%89té", + "expected": { + "href": "http://example.net/#%c3%89té", + "hash": "#%c3%89té" + } + } + ] +} diff --git a/tests/tests.rs b/tests/tests.rs deleted file mode 100644 index 11d35cde..00000000 --- a/tests/tests.rs +++ /dev/null @@ -1,191 +0,0 @@ -// Copyright 2013-2014 Simon Sapin. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -extern crate url; - -use std::net::{Ipv4Addr, Ipv6Addr}; -use url::{Host, Url}; - -#[test] -fn new_file_paths() { - use std::path::{Path, PathBuf}; - if cfg!(unix) { - assert_eq!(Url::from_file_path(Path::new("relative")), Err(())); - assert_eq!(Url::from_file_path(Path::new("../relative")), Err(())); - } else { - assert_eq!(Url::from_file_path(Path::new("relative")), Err(())); - assert_eq!(Url::from_file_path(Path::new(r"..\relative")), Err(())); - assert_eq!(Url::from_file_path(Path::new(r"\drive-relative")), Err(())); - assert_eq!(Url::from_file_path(Path::new(r"\\ucn\")), Err(())); - } - - if cfg!(unix) { - let mut url = Url::from_file_path(Path::new("/foo/bar")).unwrap(); - assert_eq!(url.host(), Some(&Host::Domain("".to_string()))); - assert_eq!(url.path(), Some(&["foo".to_string(), "bar".to_string()][..])); - assert!(url.to_file_path() == Ok(PathBuf::from("/foo/bar"))); - - url.path_mut().unwrap()[1] = "ba\0r".to_string(); - url.to_file_path().is_ok(); - - url.path_mut().unwrap()[1] = "ba%00r".to_string(); - url.to_file_path().is_ok(); - } -} - -#[test] -#[cfg(unix)] -fn new_path_bad_utf8() { - use std::ffi::OsStr; - use std::os::unix::prelude::*; - use std::path::{Path, PathBuf}; - - let url = Url::from_file_path(Path::new("/foo/ba%80r")).unwrap(); - let os_str = OsStr::from_bytes(b"/foo/ba\x80r"); - assert_eq!(url.to_file_path(), Ok(PathBuf::from(os_str))); -} - -#[test] -fn 
new_path_windows_fun() { - if cfg!(windows) { - use std::path::{Path, PathBuf}; - let mut url = Url::from_file_path(Path::new(r"C:\foo\bar")).unwrap(); - assert_eq!(url.host(), Some(&Host::Domain("".to_string()))); - assert_eq!(url.path(), Some(&["C:".to_string(), "foo".to_string(), "bar".to_string()][..])); - assert_eq!(url.to_file_path(), - Ok(PathBuf::from(r"C:\foo\bar"))); - - url.path_mut().unwrap()[2] = "ba\0r".to_string(); - assert!(url.to_file_path().is_ok()); - - url.path_mut().unwrap()[2] = "ba%00r".to_string(); - assert!(url.to_file_path().is_ok()); - - // Invalid UTF-8 - url.path_mut().unwrap()[2] = "ba%80r".to_string(); - assert!(url.to_file_path().is_err()); - - // test windows canonicalized path - let path = PathBuf::from(r"\\?\C:\foo\bar"); - assert!(Url::from_file_path(path).is_ok()); - } -} - - -#[test] -fn new_directory_paths() { - use std::path::Path; - - if cfg!(unix) { - assert_eq!(Url::from_directory_path(Path::new("relative")), Err(())); - assert_eq!(Url::from_directory_path(Path::new("../relative")), Err(())); - - let url = Url::from_directory_path(Path::new("/foo/bar")).unwrap(); - assert_eq!(url.host(), Some(&Host::Domain("".to_string()))); - assert_eq!(url.path(), Some(&["foo".to_string(), "bar".to_string(), - "".to_string()][..])); - } else { - assert_eq!(Url::from_directory_path(Path::new("relative")), Err(())); - assert_eq!(Url::from_directory_path(Path::new(r"..\relative")), Err(())); - assert_eq!(Url::from_directory_path(Path::new(r"\drive-relative")), Err(())); - assert_eq!(Url::from_directory_path(Path::new(r"\\ucn\")), Err(())); - - let url = Url::from_directory_path(Path::new(r"C:\foo\bar")).unwrap(); - assert_eq!(url.host(), Some(&Host::Domain("".to_string()))); - assert_eq!(url.path(), Some(&["C:".to_string(), "foo".to_string(), - "bar".to_string(), "".to_string()][..])); - } -} - -#[test] -fn from_str() { - assert!("http://testing.com/this".parse::().is_ok()); -} - -#[test] -fn issue_124() { - let url: Url = 
"file:a".parse().unwrap(); - assert_eq!(url.path().unwrap(), ["a"]); - let url: Url = "file:...".parse().unwrap(); - assert_eq!(url.path().unwrap(), ["..."]); - let url: Url = "file:..".parse().unwrap(); - assert_eq!(url.path().unwrap(), [""]); -} - -#[test] -fn relative_scheme_data_equality() { - use std::hash::{Hash, Hasher, SipHasher}; - - fn check_eq(a: &Url, b: &Url) { - assert_eq!(a, b); - - let mut h1 = SipHasher::new(); - a.hash(&mut h1); - let mut h2 = SipHasher::new(); - b.hash(&mut h2); - assert_eq!(h1.finish(), h2.finish()); - } - - fn url(s: &str) -> Url { - let rv = s.parse().unwrap(); - check_eq(&rv, &rv); - rv - } - - // Doesn't care if default port is given. - let a: Url = url("https://example.com/"); - let b: Url = url("https://example.com:443/"); - check_eq(&a, &b); - - // Different ports - let a: Url = url("http://example.com/"); - let b: Url = url("http://example.com:8080/"); - assert!(a != b); - - // Different scheme - let a: Url = url("http://example.com/"); - let b: Url = url("https://example.com/"); - assert!(a != b); - - // Different host - let a: Url = url("http://foo.com/"); - let b: Url = url("http://bar.com/"); - assert!(a != b); - - // Missing path, automatically substituted. Semantically the same. 
- let a: Url = url("http://foo.com"); - let b: Url = url("http://foo.com/"); - check_eq(&a, &b); -} - -#[test] -fn host() { - let a = Host::parse("www.mozilla.org").unwrap(); - let b = Host::parse("1.35.33.49").unwrap(); - let c = Host::parse("[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]").unwrap(); - let d = Host::parse("1.35.+33.49").unwrap(); - assert_eq!(a, Host::Domain("www.mozilla.org".to_owned())); - assert_eq!(b, Host::Ipv4(Ipv4Addr::new(1, 35, 33, 49))); - assert_eq!(c, Host::Ipv6(Ipv6Addr::new(0x2001, 0x0db8, 0x85a3, 0x08d3, - 0x1319, 0x8a2e, 0x0370, 0x7344))); - assert_eq!(d, Host::Domain("1.35.+33.49".to_owned())); - assert_eq!(Host::parse("[::]").unwrap(), Host::Ipv6(Ipv6Addr::new(0, 0, 0, 0, 0, 0, 0, 0))); - assert_eq!(Host::parse("[::1]").unwrap(), Host::Ipv6(Ipv6Addr::new(0, 0, 0, 0, 0, 0, 0, 1))); - assert_eq!(Host::parse("0x1.0X23.0x21.061").unwrap(), Host::Ipv4(Ipv4Addr::new(1, 35, 33, 49))); - assert_eq!(Host::parse("0x1232131").unwrap(), Host::Ipv4(Ipv4Addr::new(1, 35, 33, 49))); - assert!(Host::parse("42.0x1232131").is_err()); - assert_eq!(Host::parse("111").unwrap(), Host::Ipv4(Ipv4Addr::new(0, 0, 0, 111))); - assert_eq!(Host::parse("2..2.3").unwrap(), Host::Domain("2..2.3".to_owned())); - assert!(Host::parse("192.168.0.257").is_err()); -} - -#[test] -fn test_idna() { - assert!("http://goșu.ro".parse::().is_ok()); - assert_eq!(Url::parse("http://☃.net/").unwrap().domain(), Some("xn--n3h.net")); -} diff --git a/tests/unit.rs b/tests/unit.rs new file mode 100644 index 00000000..6038e1f9 --- /dev/null +++ b/tests/unit.rs @@ -0,0 +1,235 @@ +// Copyright 2013-2014 The rust-url developers. +// +// Licensed under the Apache License, Version 2.0 or the MIT license +// , at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +//! 
Unit tests + +extern crate url; + +use std::borrow::Cow; +use std::net::{Ipv4Addr, Ipv6Addr}; +use std::path::{Path, PathBuf}; +use url::{Host, Url, form_urlencoded}; + +macro_rules! assert_from_file_path { + ($path: expr) => { assert_from_file_path!($path, $path) }; + ($path: expr, $url_path: expr) => {{ + let url = Url::from_file_path(Path::new($path)).unwrap(); + assert_eq!(url.host(), None); + assert_eq!(url.path(), $url_path); + assert_eq!(url.to_file_path(), Ok(PathBuf::from($path))); + }}; +} + + + +#[test] +fn new_file_paths() { + if cfg!(unix) { + assert_eq!(Url::from_file_path(Path::new("relative")), Err(())); + assert_eq!(Url::from_file_path(Path::new("../relative")), Err(())); + } + if cfg!(windows) { + assert_eq!(Url::from_file_path(Path::new("relative")), Err(())); + assert_eq!(Url::from_file_path(Path::new(r"..\relative")), Err(())); + assert_eq!(Url::from_file_path(Path::new(r"\drive-relative")), Err(())); + assert_eq!(Url::from_file_path(Path::new(r"\\ucn\")), Err(())); + } + + if cfg!(unix) { + assert_from_file_path!("/foo/bar"); + assert_from_file_path!("/foo/ba\0r", "/foo/ba%00r"); + assert_from_file_path!("/foo/ba%00r", "/foo/ba%2500r"); + } +} + +#[test] +#[cfg(unix)] +fn new_path_bad_utf8() { + use std::ffi::OsStr; + use std::os::unix::prelude::*; + + let url = Url::from_file_path(Path::new(OsStr::from_bytes(b"/foo/ba\x80r"))).unwrap(); + let os_str = OsStr::from_bytes(b"/foo/ba\x80r"); + assert_eq!(url.to_file_path(), Ok(PathBuf::from(os_str))); +} + +#[test] +fn new_path_windows_fun() { + if cfg!(windows) { + assert_from_file_path!(r"C:\foo\bar", "/C:/foo/bar"); + assert_from_file_path!("C:\\foo\\ba\0r", "/C:/foo/ba%00r"); + + // Invalid UTF-8 + assert!(Url::parse("file:///C:/foo/ba%80r").unwrap().to_file_path().is_err()); + + // test windows canonicalized path + let path = PathBuf::from(r"\\?\C:\foo\bar"); + assert!(Url::from_file_path(path).is_ok()); + } +} + + +#[test] +fn new_directory_paths() { + if cfg!(unix) { + 
assert_eq!(Url::from_directory_path(Path::new("relative")), Err(())); + assert_eq!(Url::from_directory_path(Path::new("../relative")), Err(())); + + let url = Url::from_directory_path(Path::new("/foo/bar")).unwrap(); + assert_eq!(url.host(), None); + assert_eq!(url.path(), "/foo/bar/"); + } + if cfg!(windows) { + assert_eq!(Url::from_directory_path(Path::new("relative")), Err(())); + assert_eq!(Url::from_directory_path(Path::new(r"..\relative")), Err(())); + assert_eq!(Url::from_directory_path(Path::new(r"\drive-relative")), Err(())); + assert_eq!(Url::from_directory_path(Path::new(r"\\ucn\")), Err(())); + + let url = Url::from_directory_path(Path::new(r"C:\foo\bar")).unwrap(); + assert_eq!(url.host(), None); + assert_eq!(url.path(), "/C:/foo/bar/"); + } +} + +#[test] +fn from_str() { + assert!("http://testing.com/this".parse::().is_ok()); +} + +#[test] +fn issue_124() { + let url: Url = "file:a".parse().unwrap(); + assert_eq!(url.path(), "/a"); + let url: Url = "file:...".parse().unwrap(); + assert_eq!(url.path(), "/..."); + let url: Url = "file:..".parse().unwrap(); + assert_eq!(url.path(), "/"); +} + +#[test] +fn test_equality() { + use std::hash::{Hash, Hasher, SipHasher}; + + fn check_eq(a: &Url, b: &Url) { + assert_eq!(a, b); + + let mut h1 = SipHasher::new(); + a.hash(&mut h1); + let mut h2 = SipHasher::new(); + b.hash(&mut h2); + assert_eq!(h1.finish(), h2.finish()); + } + + fn url(s: &str) -> Url { + let rv = s.parse().unwrap(); + check_eq(&rv, &rv); + rv + } + + // Doesn't care if default port is given. 
+ let a: Url = url("https://example.com/"); + let b: Url = url("https://example.com:443/"); + check_eq(&a, &b); + + // Different ports + let a: Url = url("http://example.com/"); + let b: Url = url("http://example.com:8080/"); + assert!(a != b, "{:?} != {:?}", a, b); + + // Different scheme + let a: Url = url("http://example.com/"); + let b: Url = url("https://example.com/"); + assert!(a != b); + + // Different host + let a: Url = url("http://foo.com/"); + let b: Url = url("http://bar.com/"); + assert!(a != b); + + // Missing path, automatically substituted. Semantically the same. + let a: Url = url("http://foo.com"); + let b: Url = url("http://foo.com/"); + check_eq(&a, &b); +} + +#[test] +fn host() { + fn assert_host(input: &str, host: Host<&str>) { + assert_eq!(Url::parse(input).unwrap().host(), Some(host)); + } + assert_host("http://www.mozilla.org", Host::Domain("www.mozilla.org")); + assert_host("http://1.35.33.49", Host::Ipv4(Ipv4Addr::new(1, 35, 33, 49))); + assert_host("http://[2001:0db8:85a3:08d3:1319:8a2e:0370:7344]", Host::Ipv6(Ipv6Addr::new( + 0x2001, 0x0db8, 0x85a3, 0x08d3, 0x1319, 0x8a2e, 0x0370, 0x7344))); + assert_host("http://1.35.+33.49", Host::Domain("1.35.+33.49")); + assert_host("http://[::]", Host::Ipv6(Ipv6Addr::new(0, 0, 0, 0, 0, 0, 0, 0))); + assert_host("http://[::1]", Host::Ipv6(Ipv6Addr::new(0, 0, 0, 0, 0, 0, 0, 1))); + assert_host("http://0x1.0X23.0x21.061", Host::Ipv4(Ipv4Addr::new(1, 35, 33, 49))); + assert_host("http://0x1232131", Host::Ipv4(Ipv4Addr::new(1, 35, 33, 49))); + assert_host("http://111", Host::Ipv4(Ipv4Addr::new(0, 0, 0, 111))); + assert_host("http://2..2.3", Host::Domain("2..2.3")); + assert!(Url::parse("http://42.0x1232131").is_err()); + assert!(Url::parse("http://192.168.0.257").is_err()); +} + +#[test] +fn host_serialization() { + // libstd’s `Display for Ipv6Addr` serializes 0:0:0:0:0:0:_:_ and 0:0:0:0:0:ffff:_:_ + // using IPv4-like syntax, as suggested in https://tools.ietf.org/html/rfc5952#section-4 + // but 
https://url.spec.whatwg.org/#concept-ipv6-serializer specifies not to. + + // Not [::0.0.0.2] / [::ffff:0.0.0.2] + assert_eq!(Url::parse("http://[0::2]").unwrap().host_str(), Some("[::2]")); + assert_eq!(Url::parse("http://[0::ffff:0:2]").unwrap().host_str(), Some("[::ffff:0:2]")); +} + +#[test] +fn test_idna() { + assert!("http://goșu.ro".parse::().is_ok()); + assert_eq!(Url::parse("http://☃.net/").unwrap().host(), Some(Host::Domain("xn--n3h.net"))); +} + +#[test] +fn test_serialization() { + let data = [ + ("http://example.com/", "http://example.com/"), + ("http://addslash.com", "http://addslash.com/"), + ("http://@emptyuser.com/", "http://emptyuser.com/"), + ("http://:@emptypass.com/", "http://:@emptypass.com/"), + ("http://user@user.com/", "http://user@user.com/"), + ("http://user:pass@userpass.com/", "http://user:pass@userpass.com/"), + ("http://slashquery.com/path/?q=something", "http://slashquery.com/path/?q=something"), + ("http://noslashquery.com/path?q=something", "http://noslashquery.com/path?q=something") + ]; + for &(input, result) in &data { + let url = Url::parse(input).unwrap(); + assert_eq!(url.as_str(), result); + } +} + +#[test] +fn test_form_urlencoded() { + let pairs: &[(Cow, Cow)] = &[ + ("foo".into(), "é&".into()), + ("bar".into(), "".into()), + ("foo".into(), "#".into()) + ]; + let encoded = form_urlencoded::Serializer::new(String::new()).extend_pairs(pairs).finish(); + assert_eq!(encoded, "foo=%C3%A9%26&bar=&foo=%23"); + assert_eq!(form_urlencoded::parse(encoded.as_bytes()).collect::>(), pairs.to_vec()); +} + +#[test] +fn test_form_serialize() { + let encoded = form_urlencoded::Serializer::new(String::new()) + .append_pair("foo", "é&") + .append_pair("bar", "") + .append_pair("foo", "#") + .finish(); + assert_eq!(encoded, "foo=%C3%A9%26&bar=&foo=%23"); +} diff --git a/tests/urltestdata.json b/tests/urltestdata.json new file mode 100644 index 00000000..bb95804f --- /dev/null +++ b/tests/urltestdata.json @@ -0,0 +1,4228 @@ +[ + "# Based on 
http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/script-tests/segments.js", + { + "input": "http://example\t.\norg", + "base": "http://example.org/foo/bar", + "href": "http://example.org/", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://user:pass@foo:21/bar;par?b#c", + "base": "http://example.org/foo/bar", + "href": "http://user:pass@foo:21/bar;par?b#c", + "origin": "http://foo:21", + "protocol": "http:", + "username": "user", + "password": "pass", + "host": "foo:21", + "hostname": "foo", + "port": "21", + "pathname": "/bar;par", + "search": "?b", + "hash": "#c" + }, + { + "input": "http:foo.com", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/foo.com", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/foo.com", + "search": "", + "hash": "" + }, + { + "input": "\t :foo.com \n", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/:foo.com", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:foo.com", + "search": "", + "hash": "" + }, + { + "input": " foo.com ", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/foo.com", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/foo.com", + "search": "", + "hash": "" + }, + { + "input": "a:\t foo.com", + "base": "http://example.org/foo/bar", + "href": "a: foo.com", + "origin": "null", + "protocol": "a:", + "username": "", + "password": "", + "host": "", + "hostname": "", + 
"port": "", + "pathname": " foo.com", + "search": "", + "hash": "" + }, + { + "input": "http://f:21/ b ? d # e ", + "base": "http://example.org/foo/bar", + "href": "http://f:21/%20b%20?%20d%20# e", + "origin": "http://f:21", + "protocol": "http:", + "username": "", + "password": "", + "host": "f:21", + "hostname": "f", + "port": "21", + "pathname": "/%20b%20", + "search": "?%20d%20", + "hash": "# e" + }, + { + "input": "http://f:/c", + "base": "http://example.org/foo/bar", + "href": "http://f/c", + "origin": "http://f", + "protocol": "http:", + "username": "", + "password": "", + "host": "f", + "hostname": "f", + "port": "", + "pathname": "/c", + "search": "", + "hash": "" + }, + { + "input": "http://f:0/c", + "base": "http://example.org/foo/bar", + "href": "http://f:0/c", + "origin": "http://f:0", + "protocol": "http:", + "username": "", + "password": "", + "host": "f:0", + "hostname": "f", + "port": "0", + "pathname": "/c", + "search": "", + "hash": "" + }, + { + "input": "http://f:00000000000000/c", + "base": "http://example.org/foo/bar", + "href": "http://f:0/c", + "origin": "http://f:0", + "protocol": "http:", + "username": "", + "password": "", + "host": "f:0", + "hostname": "f", + "port": "0", + "pathname": "/c", + "search": "", + "hash": "" + }, + { + "input": "http://f:00000000000000000000080/c", + "base": "http://example.org/foo/bar", + "href": "http://f/c", + "origin": "http://f", + "protocol": "http:", + "username": "", + "password": "", + "host": "f", + "hostname": "f", + "port": "", + "pathname": "/c", + "search": "", + "hash": "" + }, + { + "input": "http://f:b/c", + "base": "http://example.org/foo/bar", + "failure": true + }, + { + "input": "http://f: /c", + "base": "http://example.org/foo/bar", + "failure": true + }, + { + "input": "http://f:\n/c", + "base": "http://example.org/foo/bar", + "href": "http://f/c", + "origin": "http://f", + "protocol": "http:", + "username": "", + "password": "", + "host": "f", + "hostname": "f", + "port": "", + 
"pathname": "/c", + "search": "", + "hash": "" + }, + { + "input": "http://f:fifty-two/c", + "base": "http://example.org/foo/bar", + "failure": true + }, + { + "input": "http://f:999999/c", + "base": "http://example.org/foo/bar", + "failure": true + }, + { + "input": "http://f: 21 / b ? d # e ", + "base": "http://example.org/foo/bar", + "failure": true + }, + { + "input": "", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/bar", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/bar", + "search": "", + "hash": "" + }, + { + "input": " \t", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/bar", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/bar", + "search": "", + "hash": "" + }, + { + "input": ":foo.com/", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/:foo.com/", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:foo.com/", + "search": "", + "hash": "" + }, + { + "input": ":foo.com\\", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/:foo.com/", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:foo.com/", + "search": "", + "hash": "" + }, + { + "input": ":", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/:", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:", + "search": "", + "hash": "" + 
}, + { + "input": ":a", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/:a", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:a", + "search": "", + "hash": "" + }, + { + "input": ":/", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/:/", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:/", + "search": "", + "hash": "" + }, + { + "input": ":\\", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/:/", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:/", + "search": "", + "hash": "" + }, + { + "input": ":#", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/:#", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:", + "search": "", + "hash": "" + }, + { + "input": "#", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/bar#", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/bar", + "search": "", + "hash": "" + }, + { + "input": "#/", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/bar#/", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/bar", + "search": "", + "hash": "#/" + }, + { + "input": "#\\", + "base": "http://example.org/foo/bar", + 
"href": "http://example.org/foo/bar#\\", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/bar", + "search": "", + "hash": "#\\" + }, + { + "input": "#;?", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/bar#;?", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/bar", + "search": "", + "hash": "#;?" + }, + { + "input": "?", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/bar?", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/bar", + "search": "", + "hash": "" + }, + { + "input": "/", + "base": "http://example.org/foo/bar", + "href": "http://example.org/", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": ":23", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/:23", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:23", + "search": "", + "hash": "" + }, + { + "input": "/:23", + "base": "http://example.org/foo/bar", + "href": "http://example.org/:23", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/:23", + "search": "", + "hash": "" + }, + { + "input": "::", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/::", + "origin": "http://example.org", + 
"protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/::", + "search": "", + "hash": "" + }, + { + "input": "::23", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/::23", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/::23", + "search": "", + "hash": "" + }, + { + "input": "foo://", + "base": "http://example.org/foo/bar", + "href": "foo:///", + "origin": "null", + "protocol": "foo:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://a:b@c:29/d", + "base": "http://example.org/foo/bar", + "href": "http://a:b@c:29/d", + "origin": "http://c:29", + "protocol": "http:", + "username": "a", + "password": "b", + "host": "c:29", + "hostname": "c", + "port": "29", + "pathname": "/d", + "search": "", + "hash": "" + }, + { + "input": "http::@c:29", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/:@c:29", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/:@c:29", + "search": "", + "hash": "" + }, + { + "input": "http://&a:foo(b]c@d:2/", + "base": "http://example.org/foo/bar", + "href": "http://&a:foo(b%5Dc@d:2/", + "origin": "http://d:2", + "protocol": "http:", + "username": "&a", + "password": "foo(b%5Dc", + "host": "d:2", + "hostname": "d", + "port": "2", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://::@c@d:2", + "base": "http://example.org/foo/bar", + "href": "http://:%3A%40c@d:2/", + "origin": "http://d:2", + "protocol": "http:", + "username": "", + "password": "%3A%40c", + "host": "d:2", + "hostname": "d", + "port": "2", + "pathname": 
"/", + "search": "", + "hash": "" + }, + { + "input": "http://foo.com:b@d/", + "base": "http://example.org/foo/bar", + "href": "http://foo.com:b@d/", + "origin": "http://d", + "protocol": "http:", + "username": "foo.com", + "password": "b", + "host": "d", + "hostname": "d", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://foo.com/\\@", + "base": "http://example.org/foo/bar", + "href": "http://foo.com//@", + "origin": "http://foo.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo.com", + "hostname": "foo.com", + "port": "", + "pathname": "//@", + "search": "", + "hash": "" + }, + { + "input": "http:\\\\foo.com\\", + "base": "http://example.org/foo/bar", + "href": "http://foo.com/", + "origin": "http://foo.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo.com", + "hostname": "foo.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:\\\\a\\b:c\\d@foo.com\\", + "base": "http://example.org/foo/bar", + "href": "http://a/b:c/d@foo.com/", + "origin": "http://a", + "protocol": "http:", + "username": "", + "password": "", + "host": "a", + "hostname": "a", + "port": "", + "pathname": "/b:c/d@foo.com/", + "search": "", + "hash": "" + }, + { + "input": "foo:/", + "base": "http://example.org/foo/bar", + "href": "foo:/", + "origin": "null", + "protocol": "foo:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "foo:/bar.com/", + "base": "http://example.org/foo/bar", + "href": "foo:/bar.com/", + "origin": "null", + "protocol": "foo:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/bar.com/", + "search": "", + "hash": "" + }, + { + "input": "foo://///////", + "base": "http://example.org/foo/bar", + "href": "foo://///////", + "origin": "null", + "protocol": "foo:", + "username": "", + 
"password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "///////", + "search": "", + "hash": "" + }, + { + "input": "foo://///////bar.com/", + "base": "http://example.org/foo/bar", + "href": "foo://///////bar.com/", + "origin": "null", + "protocol": "foo:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "///////bar.com/", + "search": "", + "hash": "" + }, + { + "input": "foo:////://///", + "base": "http://example.org/foo/bar", + "href": "foo:////://///", + "origin": "null", + "protocol": "foo:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "//://///", + "search": "", + "hash": "" + }, + { + "input": "c:/foo", + "base": "http://example.org/foo/bar", + "href": "c:/foo", + "origin": "null", + "protocol": "c:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/foo", + "search": "", + "hash": "" + }, + { + "input": "//foo/bar", + "base": "http://example.org/foo/bar", + "href": "http://foo/bar", + "origin": "http://foo", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/bar", + "search": "", + "hash": "" + }, + { + "input": "http://foo/path;a??e#f#g", + "base": "http://example.org/foo/bar", + "href": "http://foo/path;a??e#f#g", + "origin": "http://foo", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/path;a", + "search": "??e", + "hash": "#f#g" + }, + { + "input": "http://foo/abcd?efgh?ijkl", + "base": "http://example.org/foo/bar", + "href": "http://foo/abcd?efgh?ijkl", + "origin": "http://foo", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/abcd", + "search": "?efgh?ijkl", + "hash": "" + }, + { + "input": "http://foo/abcd#foo?bar", + "base": "http://example.org/foo/bar", + 
"href": "http://foo/abcd#foo?bar", + "origin": "http://foo", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/abcd", + "search": "", + "hash": "#foo?bar" + }, + { + "input": "[61:24:74]:98", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/[61:24:74]:98", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/[61:24:74]:98", + "search": "", + "hash": "" + }, + { + "input": "http:[61:27]/:foo", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/[61:27]/:foo", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/[61:27]/:foo", + "search": "", + "hash": "" + }, + { + "input": "http://[1::2]:3:4", + "base": "http://example.org/foo/bar", + "failure": true + }, + { + "input": "http://2001::1", + "base": "http://example.org/foo/bar", + "failure": true + }, + { + "input": "http://2001::1]", + "base": "http://example.org/foo/bar", + "failure": true + }, + { + "input": "http://2001::1]:80", + "base": "http://example.org/foo/bar", + "failure": true + }, + { + "input": "http://[2001::1]", + "base": "http://example.org/foo/bar", + "href": "http://[2001::1]/", + "origin": "http://[2001::1]", + "protocol": "http:", + "username": "", + "password": "", + "host": "[2001::1]", + "hostname": "[2001::1]", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://[2001::1]:80", + "base": "http://example.org/foo/bar", + "href": "http://[2001::1]/", + "origin": "http://[2001::1]", + "protocol": "http:", + "username": "", + "password": "", + "host": "[2001::1]", + "hostname": "[2001::1]", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": 
"http:/example.com/", + "base": "http://example.org/foo/bar", + "href": "http://example.org/example.com/", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "ftp:/example.com/", + "base": "http://example.org/foo/bar", + "href": "ftp://example.com/", + "origin": "ftp://example.com", + "protocol": "ftp:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "https:/example.com/", + "base": "http://example.org/foo/bar", + "href": "https://example.com/", + "origin": "https://example.com", + "protocol": "https:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "madeupscheme:/example.com/", + "base": "http://example.org/foo/bar", + "href": "madeupscheme:/example.com/", + "origin": "null", + "protocol": "madeupscheme:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "file:/example.com/", + "base": "http://example.org/foo/bar", + "href": "file:///example.com/", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "ftps:/example.com/", + "base": "http://example.org/foo/bar", + "href": "ftps:/example.com/", + "origin": "null", + "protocol": "ftps:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "gopher:/example.com/", + "base": "http://example.org/foo/bar", + "href": "gopher://example.com/", + 
"origin": "gopher://example.com", + "protocol": "gopher:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ws:/example.com/", + "base": "http://example.org/foo/bar", + "href": "ws://example.com/", + "origin": "ws://example.com", + "protocol": "ws:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "wss:/example.com/", + "base": "http://example.org/foo/bar", + "href": "wss://example.com/", + "origin": "wss://example.com", + "protocol": "wss:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "data:/example.com/", + "base": "http://example.org/foo/bar", + "href": "data:/example.com/", + "origin": "null", + "protocol": "data:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "javascript:/example.com/", + "base": "http://example.org/foo/bar", + "href": "javascript:/example.com/", + "origin": "null", + "protocol": "javascript:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "mailto:/example.com/", + "base": "http://example.org/foo/bar", + "href": "mailto:/example.com/", + "origin": "null", + "protocol": "mailto:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "http:example.com/", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/example.com/", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + 
"host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/example.com/", + "search": "", + "hash": "" + }, + { + "input": "ftp:example.com/", + "base": "http://example.org/foo/bar", + "href": "ftp://example.com/", + "origin": "ftp://example.com", + "protocol": "ftp:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "https:example.com/", + "base": "http://example.org/foo/bar", + "href": "https://example.com/", + "origin": "https://example.com", + "protocol": "https:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "madeupscheme:example.com/", + "base": "http://example.org/foo/bar", + "href": "madeupscheme:example.com/", + "origin": "null", + "protocol": "madeupscheme:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + { + "input": "ftps:example.com/", + "base": "http://example.org/foo/bar", + "href": "ftps:example.com/", + "origin": "null", + "protocol": "ftps:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + { + "input": "gopher:example.com/", + "base": "http://example.org/foo/bar", + "href": "gopher://example.com/", + "origin": "gopher://example.com", + "protocol": "gopher:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ws:example.com/", + "base": "http://example.org/foo/bar", + "href": "ws://example.com/", + "origin": "ws://example.com", + "protocol": "ws:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + 
"pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "wss:example.com/", + "base": "http://example.org/foo/bar", + "href": "wss://example.com/", + "origin": "wss://example.com", + "protocol": "wss:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "data:example.com/", + "base": "http://example.org/foo/bar", + "href": "data:example.com/", + "origin": "null", + "protocol": "data:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + { + "input": "javascript:example.com/", + "base": "http://example.org/foo/bar", + "href": "javascript:example.com/", + "origin": "null", + "protocol": "javascript:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + { + "input": "mailto:example.com/", + "base": "http://example.org/foo/bar", + "href": "mailto:example.com/", + "origin": "null", + "protocol": "mailto:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + { + "input": "/a/b/c", + "base": "http://example.org/foo/bar", + "href": "http://example.org/a/b/c", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/a/b/c", + "search": "", + "hash": "" + }, + { + "input": "/a/ /c", + "base": "http://example.org/foo/bar", + "href": "http://example.org/a/%20/c", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/a/%20/c", + "search": "", + "hash": "" + }, + { + "input": "/a%2fc", + "base": "http://example.org/foo/bar", + 
"href": "http://example.org/a%2fc", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/a%2fc", + "search": "", + "hash": "" + }, + { + "input": "/a/%2f/c", + "base": "http://example.org/foo/bar", + "href": "http://example.org/a/%2f/c", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/a/%2f/c", + "search": "", + "hash": "" + }, + { + "input": "#β", + "base": "http://example.org/foo/bar", + "href": "http://example.org/foo/bar#β", + "origin": "http://example.org", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/foo/bar", + "search": "", + "hash": "#β" + }, + { + "input": "data:text/html,test#test", + "base": "http://example.org/foo/bar", + "href": "data:text/html,test#test", + "origin": "null", + "protocol": "data:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "text/html,test", + "search": "", + "hash": "#test" + }, + "# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/file.html", + { + "input": "file:c:\\foo\\bar.html", + "base": "file:///tmp/mock/path", + "href": "file:///c:/foo/bar.html", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/c:/foo/bar.html", + "search": "", + "hash": "" + }, + { + "input": " File:c|////foo\\bar.html", + "base": "file:///tmp/mock/path", + "href": "file:///c:////foo/bar.html", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/c:////foo/bar.html", + "search": "", + "hash": "" + }, + { + "input": "C|/foo/bar", + "base": "file:///tmp/mock/path", + "href": "file:///C:/foo/bar", + 
"protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/C:/foo/bar", + "search": "", + "hash": "" + }, + { + "input": "/C|\\foo\\bar", + "base": "file:///tmp/mock/path", + "href": "file:///C:/foo/bar", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/C:/foo/bar", + "search": "", + "hash": "" + }, + { + "input": "//C|/foo/bar", + "base": "file:///tmp/mock/path", + "href": "file:///C:/foo/bar", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/C:/foo/bar", + "search": "", + "hash": "" + }, + { + "input": "//server/file", + "base": "file:///tmp/mock/path", + "href": "file://server/file", + "protocol": "file:", + "username": "", + "password": "", + "host": "server", + "hostname": "server", + "port": "", + "pathname": "/file", + "search": "", + "hash": "" + }, + { + "input": "\\\\server\\file", + "base": "file:///tmp/mock/path", + "href": "file://server/file", + "protocol": "file:", + "username": "", + "password": "", + "host": "server", + "hostname": "server", + "port": "", + "pathname": "/file", + "search": "", + "hash": "" + }, + { + "input": "/\\server/file", + "base": "file:///tmp/mock/path", + "href": "file://server/file", + "protocol": "file:", + "username": "", + "password": "", + "host": "server", + "hostname": "server", + "port": "", + "pathname": "/file", + "search": "", + "hash": "" + }, + { + "input": "file:///foo/bar.txt", + "base": "file:///tmp/mock/path", + "href": "file:///foo/bar.txt", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/foo/bar.txt", + "search": "", + "hash": "" + }, + { + "input": "file:///home/me", + "base": "file:///tmp/mock/path", + "href": "file:///home/me", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", 
+ "pathname": "/home/me", + "search": "", + "hash": "" + }, + { + "input": "//", + "base": "file:///tmp/mock/path", + "href": "file:///", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "///", + "base": "file:///tmp/mock/path", + "href": "file:///", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "///test", + "base": "file:///tmp/mock/path", + "href": "file:///test", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/test", + "search": "", + "hash": "" + }, + { + "input": "file://test", + "base": "file:///tmp/mock/path", + "href": "file://test/", + "protocol": "file:", + "username": "", + "password": "", + "host": "test", + "hostname": "test", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "file://localhost", + "base": "file:///tmp/mock/path", + "href": "file:///", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "file://localhost/", + "base": "file:///tmp/mock/path", + "href": "file:///", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "file://localhost/test", + "base": "file:///tmp/mock/path", + "href": "file:///test", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/test", + "search": "", + "hash": "" + }, + { + "input": "test", + "base": "file:///tmp/mock/path", + "href": "file:///tmp/mock/test", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": 
"", + "pathname": "/tmp/mock/test", + "search": "", + "hash": "" + }, + { + "input": "file:test", + "base": "file:///tmp/mock/path", + "href": "file:///tmp/mock/test", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/tmp/mock/test", + "search": "", + "hash": "" + }, + "# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/script-tests/path.js", + { + "input": "http://example.com/././foo", + "base": "about:blank", + "href": "http://example.com/foo", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/./.foo", + "base": "about:blank", + "href": "http://example.com/.foo", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/.foo", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/.", + "base": "about:blank", + "href": "http://example.com/foo/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo/", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/./", + "base": "about:blank", + "href": "http://example.com/foo/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo/", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/bar/..", + "base": "about:blank", + "href": "http://example.com/foo/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", 
+ "pathname": "/foo/", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/bar/../", + "base": "about:blank", + "href": "http://example.com/foo/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo/", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/..bar", + "base": "about:blank", + "href": "http://example.com/foo/..bar", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo/..bar", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/bar/../ton", + "base": "about:blank", + "href": "http://example.com/foo/ton", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo/ton", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/bar/../ton/../../a", + "base": "about:blank", + "href": "http://example.com/a", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/a", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/../../..", + "base": "about:blank", + "href": "http://example.com/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/../../../ton", + "base": "about:blank", + "href": "http://example.com/ton", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + 
"pathname": "/ton", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/%2e", + "base": "about:blank", + "href": "http://example.com/foo/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo/", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/%2e%2", + "base": "about:blank", + "href": "http://example.com/foo/.%2", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo/.%2", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/%2e./%2e%2e/.%2e/%2e.bar", + "base": "about:blank", + "href": "http://example.com/..bar", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/..bar", + "search": "", + "hash": "" + }, + { + "input": "http://example.com////../..", + "base": "about:blank", + "href": "http://example.com//", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "//", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/bar//../..", + "base": "about:blank", + "href": "http://example.com/foo/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo/", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo/bar//..", + "base": "about:blank", + "href": "http://example.com/foo/bar/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + 
"pathname": "/foo/bar/", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo", + "base": "about:blank", + "href": "http://example.com/foo", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/%20foo", + "base": "about:blank", + "href": "http://example.com/%20foo", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/%20foo", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo%", + "base": "about:blank", + "href": "http://example.com/foo%", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo%", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo%2", + "base": "about:blank", + "href": "http://example.com/foo%2", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo%2", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo%2zbar", + "base": "about:blank", + "href": "http://example.com/foo%2zbar", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo%2zbar", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo%2©zbar", + "base": "about:blank", + "href": "http://example.com/foo%2%C3%82%C2%A9zbar", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": 
"/foo%2%C3%82%C2%A9zbar", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo%41%7a", + "base": "about:blank", + "href": "http://example.com/foo%41%7a", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo%41%7a", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo\t\u0091%91", + "base": "about:blank", + "href": "http://example.com/foo%C2%91%91", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo%C2%91%91", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo%00%51", + "base": "about:blank", + "href": "http://example.com/foo%00%51", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foo%00%51", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/(%28:%3A%29)", + "base": "about:blank", + "href": "http://example.com/(%28:%3A%29)", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/(%28:%3A%29)", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/%3A%3a%3C%3c", + "base": "about:blank", + "href": "http://example.com/%3A%3a%3C%3c", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/%3A%3a%3C%3c", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/foo\tbar", + "base": "about:blank", + "href": "http://example.com/foobar", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": 
"example.com", + "hostname": "example.com", + "port": "", + "pathname": "/foobar", + "search": "", + "hash": "" + }, + { + "input": "http://example.com\\\\foo\\\\bar", + "base": "about:blank", + "href": "http://example.com//foo//bar", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "//foo//bar", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/%7Ffp3%3Eju%3Dduvgw%3Dd", + "base": "about:blank", + "href": "http://example.com/%7Ffp3%3Eju%3Dduvgw%3Dd", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/%7Ffp3%3Eju%3Dduvgw%3Dd", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/@asdf%40", + "base": "about:blank", + "href": "http://example.com/@asdf%40", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/@asdf%40", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/你好你好", + "base": "about:blank", + "href": "http://example.com/%E4%BD%A0%E5%A5%BD%E4%BD%A0%E5%A5%BD", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/%E4%BD%A0%E5%A5%BD%E4%BD%A0%E5%A5%BD", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/‥/foo", + "base": "about:blank", + "href": "http://example.com/%E2%80%A5/foo", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/%E2%80%A5/foo", + "search": "", + "hash": "" + }, + { + "input": "http://example.com//foo", + "base": "about:blank", + "href": 
"http://example.com/%EF%BB%BF/foo", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/%EF%BB%BF/foo", + "search": "", + "hash": "" + }, + { + "input": "http://example.com/‮/foo/‭/bar", + "base": "about:blank", + "href": "http://example.com/%E2%80%AE/foo/%E2%80%AD/bar", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/%E2%80%AE/foo/%E2%80%AD/bar", + "search": "", + "hash": "" + }, + "# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/script-tests/relative.js", + { + "input": "http://www.google.com/foo?bar=baz#", + "base": "about:blank", + "href": "http://www.google.com/foo?bar=baz#", + "origin": "http://www.google.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.google.com", + "hostname": "www.google.com", + "port": "", + "pathname": "/foo", + "search": "?bar=baz", + "hash": "" + }, + { + "input": "http://www.google.com/foo?bar=baz# »", + "base": "about:blank", + "href": "http://www.google.com/foo?bar=baz# »", + "origin": "http://www.google.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.google.com", + "hostname": "www.google.com", + "port": "", + "pathname": "/foo", + "search": "?bar=baz", + "hash": "# »" + }, + { + "input": "data:test# »", + "base": "about:blank", + "href": "data:test# »", + "origin": "null", + "protocol": "data:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "test", + "search": "", + "hash": "# »" + }, + { + "input": "http://[www.google.com]/", + "base": "about:blank", + "failure": true + }, + { + "input": "http://www.google.com", + "base": "about:blank", + "href": "http://www.google.com/", + "origin": "http://www.google.com", + "protocol": "http:", + "username": "", 
+ "password": "", + "host": "www.google.com", + "hostname": "www.google.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://192.0x00A80001", + "base": "about:blank", + "href": "http://192.168.0.1/", + "origin": "http://192.168.0.1", + "protocol": "http:", + "username": "", + "password": "", + "host": "192.168.0.1", + "hostname": "192.168.0.1", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://www/foo%2Ehtml", + "base": "about:blank", + "href": "http://www/foo.html", + "origin": "http://www", + "protocol": "http:", + "username": "", + "password": "", + "host": "www", + "hostname": "www", + "port": "", + "pathname": "/foo.html", + "search": "", + "hash": "" + }, + { + "input": "http://www/foo/%2E/html", + "base": "about:blank", + "href": "http://www/foo/html", + "origin": "http://www", + "protocol": "http:", + "username": "", + "password": "", + "host": "www", + "hostname": "www", + "port": "", + "pathname": "/foo/html", + "search": "", + "hash": "" + }, + { + "input": "http://user:pass@/", + "base": "about:blank", + "failure": true + }, + { + "input": "http://%25DOMAIN:foobar@foodomain.com/", + "base": "about:blank", + "href": "http://%25DOMAIN:foobar@foodomain.com/", + "origin": "http://foodomain.com", + "protocol": "http:", + "username": "%25DOMAIN", + "password": "foobar", + "host": "foodomain.com", + "hostname": "foodomain.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:\\\\www.google.com\\foo", + "base": "about:blank", + "href": "http://www.google.com/foo", + "origin": "http://www.google.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.google.com", + "hostname": "www.google.com", + "port": "", + "pathname": "/foo", + "search": "", + "hash": "" + }, + { + "input": "http://foo:80/", + "base": "about:blank", + "href": "http://foo/", + "origin": "http://foo", + "protocol": "http:", + "username": "", + 
"password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://foo:81/", + "base": "about:blank", + "href": "http://foo:81/", + "origin": "http://foo:81", + "protocol": "http:", + "username": "", + "password": "", + "host": "foo:81", + "hostname": "foo", + "port": "81", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "httpa://foo:80/", + "base": "about:blank", + "href": "httpa://foo:80/", + "origin": "null", + "protocol": "httpa:", + "username": "", + "password": "", + "host": "foo:80", + "hostname": "foo", + "port": "80", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://foo:-80/", + "base": "about:blank", + "failure": true + }, + { + "input": "https://foo:443/", + "base": "about:blank", + "href": "https://foo/", + "origin": "https://foo", + "protocol": "https:", + "username": "", + "password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "https://foo:80/", + "base": "about:blank", + "href": "https://foo:80/", + "origin": "https://foo:80", + "protocol": "https:", + "username": "", + "password": "", + "host": "foo:80", + "hostname": "foo", + "port": "80", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ftp://foo:21/", + "base": "about:blank", + "href": "ftp://foo/", + "origin": "ftp://foo", + "protocol": "ftp:", + "username": "", + "password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ftp://foo:80/", + "base": "about:blank", + "href": "ftp://foo:80/", + "origin": "ftp://foo:80", + "protocol": "ftp:", + "username": "", + "password": "", + "host": "foo:80", + "hostname": "foo", + "port": "80", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "gopher://foo:70/", + "base": "about:blank", + "href": "gopher://foo/", + "origin": "gopher://foo", + 
"protocol": "gopher:", + "username": "", + "password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "gopher://foo:443/", + "base": "about:blank", + "href": "gopher://foo:443/", + "origin": "gopher://foo:443", + "protocol": "gopher:", + "username": "", + "password": "", + "host": "foo:443", + "hostname": "foo", + "port": "443", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ws://foo:80/", + "base": "about:blank", + "href": "ws://foo/", + "origin": "ws://foo", + "protocol": "ws:", + "username": "", + "password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ws://foo:81/", + "base": "about:blank", + "href": "ws://foo:81/", + "origin": "ws://foo:81", + "protocol": "ws:", + "username": "", + "password": "", + "host": "foo:81", + "hostname": "foo", + "port": "81", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ws://foo:443/", + "base": "about:blank", + "href": "ws://foo:443/", + "origin": "ws://foo:443", + "protocol": "ws:", + "username": "", + "password": "", + "host": "foo:443", + "hostname": "foo", + "port": "443", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ws://foo:815/", + "base": "about:blank", + "href": "ws://foo:815/", + "origin": "ws://foo:815", + "protocol": "ws:", + "username": "", + "password": "", + "host": "foo:815", + "hostname": "foo", + "port": "815", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "wss://foo:80/", + "base": "about:blank", + "href": "wss://foo:80/", + "origin": "wss://foo:80", + "protocol": "wss:", + "username": "", + "password": "", + "host": "foo:80", + "hostname": "foo", + "port": "80", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "wss://foo:81/", + "base": "about:blank", + "href": "wss://foo:81/", + "origin": "wss://foo:81", + "protocol": "wss:", + "username": "", + 
"password": "", + "host": "foo:81", + "hostname": "foo", + "port": "81", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "wss://foo:443/", + "base": "about:blank", + "href": "wss://foo/", + "origin": "wss://foo", + "protocol": "wss:", + "username": "", + "password": "", + "host": "foo", + "hostname": "foo", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "wss://foo:815/", + "base": "about:blank", + "href": "wss://foo:815/", + "origin": "wss://foo:815", + "protocol": "wss:", + "username": "", + "password": "", + "host": "foo:815", + "hostname": "foo", + "port": "815", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:/example.com/", + "base": "about:blank", + "href": "http://example.com/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ftp:/example.com/", + "base": "about:blank", + "href": "ftp://example.com/", + "origin": "ftp://example.com", + "protocol": "ftp:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "https:/example.com/", + "base": "about:blank", + "href": "https://example.com/", + "origin": "https://example.com", + "protocol": "https:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "madeupscheme:/example.com/", + "base": "about:blank", + "href": "madeupscheme:/example.com/", + "origin": "null", + "protocol": "madeupscheme:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "file:/example.com/", + "base": "about:blank", + "href": 
"file:///example.com/", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "ftps:/example.com/", + "base": "about:blank", + "href": "ftps:/example.com/", + "origin": "null", + "protocol": "ftps:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "gopher:/example.com/", + "base": "about:blank", + "href": "gopher://example.com/", + "origin": "gopher://example.com", + "protocol": "gopher:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ws:/example.com/", + "base": "about:blank", + "href": "ws://example.com/", + "origin": "ws://example.com", + "protocol": "ws:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "wss:/example.com/", + "base": "about:blank", + "href": "wss://example.com/", + "origin": "wss://example.com", + "protocol": "wss:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "data:/example.com/", + "base": "about:blank", + "href": "data:/example.com/", + "origin": "null", + "protocol": "data:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "javascript:/example.com/", + "base": "about:blank", + "href": "javascript:/example.com/", + "origin": "null", + "protocol": "javascript:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + 
"input": "mailto:/example.com/", + "base": "about:blank", + "href": "mailto:/example.com/", + "origin": "null", + "protocol": "mailto:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/example.com/", + "search": "", + "hash": "" + }, + { + "input": "http:example.com/", + "base": "about:blank", + "href": "http://example.com/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ftp:example.com/", + "base": "about:blank", + "href": "ftp://example.com/", + "origin": "ftp://example.com", + "protocol": "ftp:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "https:example.com/", + "base": "about:blank", + "href": "https://example.com/", + "origin": "https://example.com", + "protocol": "https:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "madeupscheme:example.com/", + "base": "about:blank", + "href": "madeupscheme:example.com/", + "origin": "null", + "protocol": "madeupscheme:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + { + "input": "ftps:example.com/", + "base": "about:blank", + "href": "ftps:example.com/", + "origin": "null", + "protocol": "ftps:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + { + "input": "gopher:example.com/", + "base": "about:blank", + "href": "gopher://example.com/", + "origin": "gopher://example.com", + "protocol": "gopher:", + "username": "", + "password": "", + "host": 
"example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "ws:example.com/", + "base": "about:blank", + "href": "ws://example.com/", + "origin": "ws://example.com", + "protocol": "ws:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "wss:example.com/", + "base": "about:blank", + "href": "wss://example.com/", + "origin": "wss://example.com", + "protocol": "wss:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "data:example.com/", + "base": "about:blank", + "href": "data:example.com/", + "origin": "null", + "protocol": "data:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + { + "input": "javascript:example.com/", + "base": "about:blank", + "href": "javascript:example.com/", + "origin": "null", + "protocol": "javascript:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + { + "input": "mailto:example.com/", + "base": "about:blank", + "href": "mailto:example.com/", + "origin": "null", + "protocol": "mailto:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "example.com/", + "search": "", + "hash": "" + }, + "# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/segments-userinfo-vs-host.html", + { + "input": "http:@www.example.com", + "base": "about:blank", + "href": "http://www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + 
"hash": "" + }, + { + "input": "http:/@www.example.com", + "base": "about:blank", + "href": "http://www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://@www.example.com", + "base": "about:blank", + "href": "http://www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:a:b@www.example.com", + "base": "about:blank", + "href": "http://a:b@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "a", + "password": "b", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:/a:b@www.example.com", + "base": "about:blank", + "href": "http://a:b@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "a", + "password": "b", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://a:b@www.example.com", + "base": "about:blank", + "href": "http://a:b@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "a", + "password": "b", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://@pple.com", + "base": "about:blank", + "href": "http://pple.com/", + "origin": "http://pple.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "pple.com", + "hostname": "pple.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": 
"http::b@www.example.com", + "base": "about:blank", + "href": "http://:b@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "b", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:/:b@www.example.com", + "base": "about:blank", + "href": "http://:b@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "b", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://:b@www.example.com", + "base": "about:blank", + "href": "http://:b@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "b", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:/:@/www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http://user@/www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http:@/www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http:/@/www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http://@/www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "https:@/www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http:a:b@/www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http:/a:b@/www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http://a:b@/www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http::@/www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http:a:@www.example.com", + "base": "about:blank", + "href": 
"http://a:@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "a", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:/a:@www.example.com", + "base": "about:blank", + "href": "http://a:@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "a", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://a:@www.example.com", + "base": "about:blank", + "href": "http://a:@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "a", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://www.@pple.com", + "base": "about:blank", + "href": "http://www.@pple.com/", + "origin": "http://pple.com", + "protocol": "http:", + "username": "www.", + "password": "", + "host": "pple.com", + "hostname": "pple.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http:@:www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http:/@:www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http://@:www.example.com", + "base": "about:blank", + "failure": true + }, + { + "input": "http://:@www.example.com", + "base": "about:blank", + "href": "http://:@www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + "# Others", + { + "input": "/", + "base": "http://www.example.com/test", + "href": "http://www.example.com/", + "origin": "http://www.example.com", + 
"protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "/test.txt", + "base": "http://www.example.com/test", + "href": "http://www.example.com/test.txt", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/test.txt", + "search": "", + "hash": "" + }, + { + "input": ".", + "base": "http://www.example.com/test", + "href": "http://www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "..", + "base": "http://www.example.com/test", + "href": "http://www.example.com/", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "test.txt", + "base": "http://www.example.com/test", + "href": "http://www.example.com/test.txt", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/test.txt", + "search": "", + "hash": "" + }, + { + "input": "./test.txt", + "base": "http://www.example.com/test", + "href": "http://www.example.com/test.txt", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/test.txt", + "search": "", + "hash": "" + }, + { + "input": "../test.txt", + "base": "http://www.example.com/test", + "href": "http://www.example.com/test.txt", + 
"origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/test.txt", + "search": "", + "hash": "" + }, + { + "input": "../aaa/test.txt", + "base": "http://www.example.com/test", + "href": "http://www.example.com/aaa/test.txt", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/aaa/test.txt", + "search": "", + "hash": "" + }, + { + "input": "../../test.txt", + "base": "http://www.example.com/test", + "href": "http://www.example.com/test.txt", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/test.txt", + "search": "", + "hash": "" + }, + { + "input": "中/test.txt", + "base": "http://www.example.com/test", + "href": "http://www.example.com/%E4%B8%AD/test.txt", + "origin": "http://www.example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example.com", + "hostname": "www.example.com", + "port": "", + "pathname": "/%E4%B8%AD/test.txt", + "search": "", + "hash": "" + }, + { + "input": "http://www.example2.com", + "base": "http://www.example.com/test", + "href": "http://www.example2.com/", + "origin": "http://www.example2.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example2.com", + "hostname": "www.example2.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "//www.example2.com", + "base": "http://www.example.com/test", + "href": "http://www.example2.com/", + "origin": "http://www.example2.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.example2.com", + "hostname": "www.example2.com", + "port": "", + "pathname": "/", + "search": "", + 
"hash": "" + }, + { + "input": "file:...", + "base": "http://www.example.com/test", + "href": "file:///...", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/...", + "search": "", + "hash": "" + }, + { + "input": "file:..", + "base": "http://www.example.com/test", + "href": "file:///", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "file:a", + "base": "http://www.example.com/test", + "href": "file:///a", + "protocol": "file:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/a", + "search": "", + "hash": "" + }, + "# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/host.html", + "Basic canonicalization, uppercase should be converted to lowercase", + { + "input": "http://ExAmPlE.CoM", + "base": "http://other.com/", + "href": "http://example.com/", + "origin": "http://example.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://example example.com", + "base": "http://other.com/", + "failure": true + }, + { + "input": "http://Goo%20 goo%7C|.com", + "base": "http://other.com/", + "failure": true + }, + { + "input": "http://[]", + "base": "http://other.com/", + "failure": true + }, + { + "input": "http://[:]", + "base": "http://other.com/", + "failure": true + }, + "U+3000 is mapped to U+0020 (space) which is disallowed", + { + "input": "http://GOO\u00a0\u3000goo.com", + "base": "http://other.com/", + "failure": true + }, + "Other types of space (no-break, zero-width, zero-width-no-break) are name-prepped away to nothing. 
U+200B, U+2060, and U+FEFF, are ignored", + { + "input": "http://GOO\u200b\u2060\ufeffgoo.com", + "base": "http://other.com/", + "href": "http://googoo.com/", + "origin": "http://googoo.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "googoo.com", + "hostname": "googoo.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + "Ideographic full stop (full-width period for Chinese, etc.) should be treated as a dot. U+3002 is mapped to U+002E (dot)", + { + "input": "http://www.foo。bar.com", + "base": "http://other.com/", + "href": "http://www.foo.bar.com/", + "origin": "http://www.foo.bar.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "www.foo.bar.com", + "hostname": "www.foo.bar.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + "Invalid unicode characters should fail... U+FDD0 is disallowed; %ef%b7%90 is U+FDD0", + { + "input": "http://\ufdd0zyx.com", + "base": "http://other.com/", + "failure": true + }, + "This is the same as previous but escaped", + { + "input": "http://%ef%b7%90zyx.com", + "base": "http://other.com/", + "failure": true + }, + "Test name prepping, fullwidth input should be converted to ASCII and NOT IDN-ized. This is 'Go' in fullwidth UTF-8/UTF-16.", + { + "input": "http://Go.com", + "base": "http://other.com/", + "href": "http://go.com/", + "origin": "http://go.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "go.com", + "hostname": "go.com", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + "URL spec forbids the following. 
https://www.w3.org/Bugs/Public/show_bug.cgi?id=24257", + { + "input": "http://%41.com", + "base": "http://other.com/", + "failure": true + }, + { + "input": "http://%ef%bc%85%ef%bc%94%ef%bc%91.com", + "base": "http://other.com/", + "failure": true + }, + "...%00 in fullwidth should fail (also as escaped UTF-8 input)", + { + "input": "http://%00.com", + "base": "http://other.com/", + "failure": true + }, + { + "input": "http://%ef%bc%85%ef%bc%90%ef%bc%90.com", + "base": "http://other.com/", + "failure": true + }, + "Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN", + { + "input": "http://你好你好", + "base": "http://other.com/", + "href": "http://xn--6qqa088eba/", + "origin": "http://你好你好", + "protocol": "http:", + "username": "", + "password": "", + "host": "xn--6qqa088eba", + "hostname": "xn--6qqa088eba", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + "Invalid escaped characters should fail and the percents should be escaped. https://www.w3.org/Bugs/Public/show_bug.cgi?id=24191", + { + "input": "http://%zz%66%a.com", + "base": "http://other.com/", + "failure": true + }, + "If we get an invalid character that has been escaped.", + { + "input": "http://%25", + "base": "http://other.com/", + "failure": true + }, + { + "input": "http://hello%00", + "base": "http://other.com/", + "failure": true + }, + "Escaped numbers should be treated like IP addresses if they are.", + { + "input": "http://%30%78%63%30%2e%30%32%35%30.01", + "base": "http://other.com/", + "href": "http://192.168.0.1/", + "origin": "http://192.168.0.1", + "protocol": "http:", + "username": "", + "password": "", + "host": "192.168.0.1", + "hostname": "192.168.0.1", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://%30%78%63%30%2e%30%32%35%30.01%2e", + "base": "http://other.com/", + "href": "http://192.168.0.1/", + "origin": "http://192.168.0.1", + "protocol": "http:", + "username": "", + "password": "", + "host": 
"192.168.0.1", + "hostname": "192.168.0.1", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "http://192.168.0.257", + "base": "http://other.com/", + "failure": true + }, + "Invalid escaping should trigger the regular host error handling", + { + "input": "http://%3g%78%63%30%2e%30%32%35%30%2E.01", + "base": "http://other.com/", + "failure": true + }, + "Something that isn't exactly an IP should get treated as a host and spaces escaped", + { + "input": "http://192.168.0.1 hello", + "base": "http://other.com/", + "failure": true + }, + "Fullwidth and escaped UTF-8 fullwidth should still be treated as IP", + { + "input": "http://0Xc0.0250.01", + "base": "http://other.com/", + "href": "http://192.168.0.1/", + "origin": "http://192.168.0.1", + "protocol": "http:", + "username": "", + "password": "", + "host": "192.168.0.1", + "hostname": "192.168.0.1", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + "Broken IPv6", + { + "input": "http://[google.com]", + "base": "http://other.com/", + "failure": true + }, + "Misc Unicode", + { + "input": "http://foo:💩@example.com/bar", + "base": "http://other.com/", + "href": "http://foo:%F0%9F%92%A9@example.com/bar", + "origin": "http://example.com", + "protocol": "http:", + "username": "foo", + "password": "%F0%9F%92%A9", + "host": "example.com", + "hostname": "example.com", + "port": "", + "pathname": "/bar", + "search": "", + "hash": "" + }, + "# resolving a fragment against any scheme succeeds", + { + "input": "#", + "base": "test:test", + "href": "test:test#", + "origin": "null", + "protocol": "test:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "test", + "search": "", + "hash": "" + }, + { + "input": "#x", + "base": "mailto:x@x.com", + "href": "mailto:x@x.com#x", + "origin": "null", + "protocol": "mailto:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "x@x.com", + "search": 
"", + "hash": "#x" + }, + { + "input": "#x", + "base": "data:,", + "href": "data:,#x", + "origin": "null", + "protocol": "data:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": ",", + "search": "", + "hash": "#x" + }, + { + "input": "#x", + "base": "about:blank", + "href": "about:blank#x", + "origin": "null", + "protocol": "about:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "blank", + "search": "", + "hash": "#x" + }, + { + "input": "#", + "base": "test:test?test", + "href": "test:test?test#", + "origin": "null", + "protocol": "test:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "test", + "search": "?test", + "hash": "" + }, + "# multiple @ in authority state", + { + "input": "https://@test@test@example:800/", + "base": "http://doesnotmatter/", + "href": "https://%40test%40test@example:800/", + "origin": "https://example:800", + "protocol": "https:", + "username": "%40test%40test", + "password": "", + "host": "example:800", + "hostname": "example", + "port": "800", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "https://@@@example", + "base": "http://doesnotmatter/", + "href": "https://%40%40@example/", + "origin": "https://example", + "protocol": "https:", + "username": "%40%40", + "password": "", + "host": "example", + "hostname": "example", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + "non-az-09 characters", + { + "input": "http://`{}:`{}@h/`{}?`{}", + "base": "http://doesnotmatter/", + "href": "http://%60%7B%7D:%60%7B%7D@h/%60%7B%7D?`{}", + "origin": "http://h", + "protocol": "http:", + "username": "%60%7B%7D", + "password": "%60%7B%7D", + "host": "h", + "hostname": "h", + "port": "", + "pathname": "/%60%7B%7D", + "search": "?`{}", + "hash": "" + }, + "# Credentials in base", + { + "input": "/some/path", + "base": "http://user@example.org/smth", + "href": 
"http://user@example.org/some/path", + "origin": "http://example.org", + "protocol": "http:", + "username": "user", + "password": "", + "host": "example.org", + "hostname": "example.org", + "port": "", + "pathname": "/some/path", + "search": "", + "hash": "" + }, + { + "input": "", + "base": "http://user:pass@example.org:21/smth", + "href": "http://user:pass@example.org:21/smth", + "origin": "http://example.org:21", + "protocol": "http:", + "username": "user", + "password": "pass", + "host": "example.org:21", + "hostname": "example.org", + "port": "21", + "pathname": "/smth", + "search": "", + "hash": "" + }, + { + "input": "/some/path", + "base": "http://user:pass@example.org:21/smth", + "href": "http://user:pass@example.org:21/some/path", + "origin": "http://example.org:21", + "protocol": "http:", + "username": "user", + "password": "pass", + "host": "example.org:21", + "hostname": "example.org", + "port": "21", + "pathname": "/some/path", + "search": "", + "hash": "" + }, + "# a set of tests designed by zcorpan for relative URLs with unknown schemes", + { + "input": "i", + "base": "sc:sd", + "failure": true + }, + { + "input": "i", + "base": "sc:sd/sd", + "failure": true + }, + { + "input": "i", + "base": "sc:/pa/pa", + "href": "sc:/pa/i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/pa/i", + "search": "", + "hash": "" + }, + { + "input": "i", + "base": "sc://ho/pa", + "href": "sc://ho/i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "ho", + "hostname": "ho", + "port": "", + "pathname": "/i", + "search": "", + "hash": "" + }, + { + "input": "i", + "base": "sc:///pa/pa", + "href": "sc:///pa/i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/pa/i", + "search": "", + "hash": "" + }, + { + "input": "../i", + "base": "sc:sd", + "failure": true 
+ }, + { + "input": "../i", + "base": "sc:sd/sd", + "failure": true + }, + { + "input": "../i", + "base": "sc:/pa/pa", + "href": "sc:/i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/i", + "search": "", + "hash": "" + }, + { + "input": "../i", + "base": "sc://ho/pa", + "href": "sc://ho/i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "ho", + "hostname": "ho", + "port": "", + "pathname": "/i", + "search": "", + "hash": "" + }, + { + "input": "../i", + "base": "sc:///pa/pa", + "href": "sc:///i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/i", + "search": "", + "hash": "" + }, + { + "input": "/i", + "base": "sc:sd", + "failure": true + }, + { + "input": "/i", + "base": "sc:sd/sd", + "failure": true + }, + { + "input": "/i", + "base": "sc:/pa/pa", + "href": "sc:/i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/i", + "search": "", + "hash": "" + }, + { + "input": "/i", + "base": "sc://ho/pa", + "href": "sc://ho/i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "ho", + "hostname": "ho", + "port": "", + "pathname": "/i", + "search": "", + "hash": "" + }, + { + "input": "/i", + "base": "sc:///pa/pa", + "href": "sc:///i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/i", + "search": "", + "hash": "" + }, + { + "input": "?i", + "base": "sc:sd", + "failure": true + }, + { + "input": "?i", + "base": "sc:sd/sd", + "failure": true + }, + { + "input": "?i", + "base": "sc:/pa/pa", + "href": "sc:/pa/pa?i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + 
"pathname": "/pa/pa", + "search": "?i", + "hash": "" + }, + { + "input": "?i", + "base": "sc://ho/pa", + "href": "sc://ho/pa?i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "ho", + "hostname": "ho", + "port": "", + "pathname": "/pa", + "search": "?i", + "hash": "" + }, + { + "input": "?i", + "base": "sc:///pa/pa", + "href": "sc:///pa/pa?i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/pa/pa", + "search": "?i", + "hash": "" + }, + { + "input": "#i", + "base": "sc:sd", + "href": "sc:sd#i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "sd", + "search": "", + "hash": "#i" + }, + { + "input": "#i", + "base": "sc:sd/sd", + "href": "sc:sd/sd#i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "sd/sd", + "search": "", + "hash": "#i" + }, + { + "input": "#i", + "base": "sc:/pa/pa", + "href": "sc:/pa/pa#i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/pa/pa", + "search": "", + "hash": "#i" + }, + { + "input": "#i", + "base": "sc://ho/pa", + "href": "sc://ho/pa#i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "ho", + "hostname": "ho", + "port": "", + "pathname": "/pa", + "search": "", + "hash": "#i" + }, + { + "input": "#i", + "base": "sc:///pa/pa", + "href": "sc:///pa/pa#i", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/pa/pa", + "search": "", + "hash": "#i" + }, + "# make sure that relative URL logic works on known typically non-relative schemes too", + { + "input": "about:/../", + "base": "about:blank", + "href": "about:/", + "origin": 
"null", + "protocol": "about:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "data:/../", + "base": "about:blank", + "href": "data:/", + "origin": "null", + "protocol": "data:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "javascript:/../", + "base": "about:blank", + "href": "javascript:/", + "origin": "null", + "protocol": "javascript:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + { + "input": "mailto:/../", + "base": "about:blank", + "href": "mailto:/", + "origin": "null", + "protocol": "mailto:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + "# unknown schemes and non-ASCII domains", + { + "input": "sc://ñ.test/", + "base": "about:blank", + "href": "sc://xn--ida.test/", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "xn--ida.test", + "hostname": "xn--ida.test", + "port": "", + "pathname": "/", + "search": "", + "hash": "" + }, + "# unknown schemes and backslashes", + { + "input": "sc:\\../", + "base": "about:blank", + "href": "sc:\\../", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": "\\../", + "search": "", + "hash": "" + }, + "# unknown scheme with path looking like a password", + { + "input": "sc::a@example.net", + "base": "about:blank", + "href": "sc::a@example.net", + "origin": "null", + "protocol": "sc:", + "username": "", + "password": "", + "host": "", + "hostname": "", + "port": "", + "pathname": ":a@example.net", + "search": "", + "hash": "" + }, + "# tests from jsdom/whatwg-url designed for code coverage", + { + "input": 
"http://127.0.0.1:10100/relative_import.html", + "base": "about:blank", + "href": "http://127.0.0.1:10100/relative_import.html", + "origin": "http://127.0.0.1:10100", + "protocol": "http:", + "username": "", + "password": "", + "host": "127.0.0.1:10100", + "hostname": "127.0.0.1", + "port": "10100", + "pathname": "/relative_import.html", + "search": "", + "hash": "" + }, + { + "input": "http://facebook.com/?foo=%7B%22abc%22", + "base": "about:blank", + "href": "http://facebook.com/?foo=%7B%22abc%22", + "origin": "http://facebook.com", + "protocol": "http:", + "username": "", + "password": "", + "host": "facebook.com", + "hostname": "facebook.com", + "port": "", + "pathname": "/", + "search": "?foo=%7B%22abc%22", + "hash": "" + }, + { + "input": "https://localhost:3000/jqueryui@1.2.3", + "base": "about:blank", + "href": "https://localhost:3000/jqueryui@1.2.3", + "origin": "https://localhost:3000", + "protocol": "https:", + "username": "", + "password": "", + "host": "localhost:3000", + "hostname": "localhost", + "port": "3000", + "pathname": "/jqueryui@1.2.3", + "search": "", + "hash": "" + } +] diff --git a/tests/urltestdata.txt b/tests/urltestdata.txt deleted file mode 100644 index 88a63c18..00000000 --- a/tests/urltestdata.txt +++ /dev/null @@ -1,329 +0,0 @@ -# This file is from https://github.com/w3c/web-platform-tests/blob/master/url/urltestdata.txt -# and used under a 3-clause BSD license. 
- -# FORMAT NOT DOCUMENTED YET (parser is urltestparser.js) -# https://github.com/w3c/web-platform-tests/blob/master/url/urltestparser.js - -# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/script-tests/segments.js -http://example\t.\norg http://example.org/foo/bar s:http h:example.org p:/ -http://user:pass@foo:21/bar;par?b#c s:http u:user pass:pass h:foo port:21 p:/bar;par q:?b f:#c -http:foo.com s:http h:example.org p:/foo/foo.com -\t\s\s\s:foo.com\s\s\s\n s:http h:example.org p:/foo/:foo.com -\sfoo.com\s\s s:http h:example.org p:/foo/foo.com -a:\t\sfoo.com s:a p:\sfoo.com -http://f:21/\sb\s?\sd\s#\se\s s:http h:f port:21 p:/%20b%20 q:?%20d%20 f:#\se -http://f:/c s:http h:f p:/c -http://f:0/c s:http h:f port:0 p:/c -http://f:00000000000000/c s:http h:f port:0 p:/c -http://f:00000000000000000000080/c s:http h:f p:/c -http://f:b/c -http://f:\s/c -http://f:\n/c s:http h:f p:/c -http://f:fifty-two/c -http://f:9999/c s:http h:f port:9999 p:/c -http://f:\s21\s/\sb\s?\sd\s#\se\s - s:http h:example.org p:/foo/bar -\s\s\t s:http h:example.org p:/foo/bar -:foo.com/ s:http h:example.org p:/foo/:foo.com/ -:foo.com\\ s:http h:example.org p:/foo/:foo.com/ -: s:http h:example.org p:/foo/: -:a s:http h:example.org p:/foo/:a -:/ s:http h:example.org p:/foo/:/ -:\\ s:http h:example.org p:/foo/:/ -:# s:http h:example.org p:/foo/: f:# -# s:http h:example.org p:/foo/bar f:# -#/ s:http h:example.org p:/foo/bar f:#/ -#\\ s:http h:example.org p:/foo/bar f:#\\ -#;? s:http h:example.org p:/foo/bar f:#;? -? s:http h:example.org p:/foo/bar q:? 
-/ s:http h:example.org p:/ -:23 s:http h:example.org p:/foo/:23 -/:23 s:http h:example.org p:/:23 -:: s:http h:example.org p:/foo/:: -::23 s:http h:example.org p:/foo/::23 -foo:// s:foo p:// -http://a:b@c:29/d s:http u:a pass:b h:c port:29 p:/d -http::@c:29 s:http h:example.org p:/foo/:@c:29 -http://&a:foo(b]c@d:2/ s:http u:&a pass:foo(b]c h:d port:2 p:/ -http://::@c@d:2 s:http pass::%40c h:d port:2 p:/ -http://foo.com:b@d/ s:http u:foo.com pass:b h:d p:/ -http://foo.com/\\@ s:http h:foo.com p://@ -http:\\\\foo.com\\ s:http h:foo.com p:/ -http:\\\\a\\b:c\\d@foo.com\\ s:http h:a p:/b:c/d@foo.com/ -foo:/ s:foo p:/ -foo:/bar.com/ s:foo p:/bar.com/ -foo:///////// s:foo p:///////// -foo://///////bar.com/ s:foo p://///////bar.com/ -foo:////:///// s:foo p:////:///// -c:/foo s:c p:/foo -//foo/bar s:http h:foo p:/bar -http://foo/path;a??e#f#g s:http h:foo p:/path;a q:??e f:#f#g -http://foo/abcd?efgh?ijkl s:http h:foo p:/abcd q:?efgh?ijkl -http://foo/abcd#foo?bar s:http h:foo p:/abcd f:#foo?bar -[61:24:74]:98 s:http h:example.org p:/foo/[61:24:74]:98 -http:[61:27]/:foo s:http h:example.org p:/foo/[61:27]/:foo -http://[1::2]:3:4 -http://2001::1 -http://2001::1] -http://2001::1]:80 -http://[2001::1] s:http h:[2001::1] p:/ -http://[2001::1]:80 s:http h:[2001::1] p:/ -http:/example.com/ s:http h:example.org p:/example.com/ -ftp:/example.com/ s:ftp h:example.com p:/ -https:/example.com/ s:https h:example.com p:/ -madeupscheme:/example.com/ s:madeupscheme p:/example.com/ -file:/example.com/ s:file p:/example.com/ -ftps:/example.com/ s:ftps p:/example.com/ -gopher:/example.com/ s:gopher h:example.com p:/ -ws:/example.com/ s:ws h:example.com p:/ -wss:/example.com/ s:wss h:example.com p:/ -data:/example.com/ s:data p:/example.com/ -javascript:/example.com/ s:javascript p:/example.com/ -mailto:/example.com/ s:mailto p:/example.com/ -http:example.com/ s:http h:example.org p:/foo/example.com/ -ftp:example.com/ s:ftp h:example.com p:/ -https:example.com/ s:https h:example.com p:/ 
-madeupscheme:example.com/ s:madeupscheme p:example.com/ -ftps:example.com/ s:ftps p:example.com/ -gopher:example.com/ s:gopher h:example.com p:/ -ws:example.com/ s:ws h:example.com p:/ -wss:example.com/ s:wss h:example.com p:/ -data:example.com/ s:data p:example.com/ -javascript:example.com/ s:javascript p:example.com/ -mailto:example.com/ s:mailto p:example.com/ -/a/b/c s:http h:example.org p:/a/b/c -/a/\s/c s:http h:example.org p:/a/%20/c -/a%2fc s:http h:example.org p:/a%2fc -/a/%2f/c s:http h:example.org p:/a/%2f/c -#\u03B2 s:http h:example.org p:/foo/bar f:#\u03B2 -data:text/html,test#test s:data p:text/html,test f:#test - -# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/file.html -file:c:\\foo\\bar.html file:///tmp/mock/path s:file p:/c:/foo/bar.html -\s\sFile:c|////foo\\bar.html s:file p:/c:////foo/bar.html -C|/foo/bar s:file p:/C:/foo/bar -/C|\\foo\\bar s:file p:/C:/foo/bar -//C|/foo/bar s:file p:/C:/foo/bar -//server/file s:file h:server p:/file -\\\\server\\file s:file h:server p:/file -/\\server/file s:file h:server p:/file -file:///foo/bar.txt s:file p:/foo/bar.txt -file:///home/me s:file p:/home/me -// s:file p:/ -/// s:file p:/ -///test s:file p:/test -file://test s:file h:test p:/ -file://localhost s:file h:localhost p:/ -file://localhost/ s:file h:localhost p:/ -file://localhost/test s:file h:localhost p:/test -test s:file p:/tmp/mock/test -file:test s:file p:/tmp/mock/test - -# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/script-tests/path.js -http://example.com/././foo about:blank s:http h:example.com p:/foo -http://example.com/./.foo s:http h:example.com p:/.foo -http://example.com/foo/. s:http h:example.com p:/foo/ -http://example.com/foo/./ s:http h:example.com p:/foo/ -http://example.com/foo/bar/.. 
s:http h:example.com p:/foo/ -http://example.com/foo/bar/../ s:http h:example.com p:/foo/ -http://example.com/foo/..bar s:http h:example.com p:/foo/..bar -http://example.com/foo/bar/../ton s:http h:example.com p:/foo/ton -http://example.com/foo/bar/../ton/../../a s:http h:example.com p:/a -http://example.com/foo/../../.. s:http h:example.com p:/ -http://example.com/foo/../../../ton s:http h:example.com p:/ton -http://example.com/foo/%2e s:http h:example.com p:/foo/ -http://example.com/foo/%2e%2 s:http h:example.com p:/foo/%2e%2 -http://example.com/foo/%2e./%2e%2e/.%2e/%2e.bar s:http h:example.com p:/%2e.bar -http://example.com////../.. s:http h:example.com p:// -http://example.com/foo/bar//../.. s:http h:example.com p:/foo/ -http://example.com/foo/bar//.. s:http h:example.com p:/foo/bar/ -http://example.com/foo s:http h:example.com p:/foo -http://example.com/%20foo s:http h:example.com p:/%20foo -http://example.com/foo% s:http h:example.com p:/foo% -http://example.com/foo%2 s:http h:example.com p:/foo%2 -http://example.com/foo%2zbar s:http h:example.com p:/foo%2zbar -http://example.com/foo%2\u00C2\u00A9zbar s:http h:example.com p:/foo%2%C3%82%C2%A9zbar -http://example.com/foo%41%7a s:http h:example.com p:/foo%41%7a -http://example.com/foo\t\u0091%91 s:http h:example.com p:/foo%C2%91%91 -http://example.com/foo%00%51 s:http h:example.com p:/foo%00%51 -http://example.com/(%28:%3A%29) s:http h:example.com p:/(%28:%3A%29) -http://example.com/%3A%3a%3C%3c s:http h:example.com p:/%3A%3a%3C%3c -http://example.com/foo\tbar s:http h:example.com p:/foobar -http://example.com\\\\foo\\\\bar s:http h:example.com p://foo//bar -http://example.com/%7Ffp3%3Eju%3Dduvgw%3Dd s:http h:example.com p:/%7Ffp3%3Eju%3Dduvgw%3Dd -http://example.com/@asdf%40 s:http h:example.com p:/@asdf%40 -http://example.com/\u4F60\u597D\u4F60\u597D s:http h:example.com p:/%E4%BD%A0%E5%A5%BD%E4%BD%A0%E5%A5%BD -http://example.com/\u2025/foo s:http h:example.com p:/%E2%80%A5/foo -http://example.com/\uFEFF/foo 
s:http h:example.com p:/%EF%BB%BF/foo -http://example.com/\u202E/foo/\u202D/bar s:http h:example.com p:/%E2%80%AE/foo/%E2%80%AD/bar - -# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/script-tests/relative.js -http://www.google.com/foo?bar=baz# about:blank s:http h:www.google.com p:/foo q:?bar=baz f:# -http://www.google.com/foo?bar=baz#\s\u00BB s:http h:www.google.com p:/foo q:?bar=baz f:#\s%C2%BB -http://[www.google.com]/ -http://www.google.com s:http h:www.google.com p:/ -http://192.0x00A80001 s:http h:192.168.0.1 p:/ -http://www/foo%2Ehtml s:http h:www p:/foo%2Ehtml -http://www/foo/%2E/html s:http h:www p:/foo/html -http://user:pass@/ -http://%25DOMAIN:foobar@foodomain.com/ s:http u:%25DOMAIN pass:foobar h:foodomain.com p:/ -http:\\\\www.google.com\\foo s:http h:www.google.com p:/foo -http://foo:80/ s:http h:foo p:/ -http://foo:81/ s:http h:foo port:81 p:/ -httpa://foo:80/ s:httpa p://foo:80/ -http://foo:-80/ -https://foo:443/ s:https h:foo p:/ -https://foo:80/ s:https h:foo port:80 p:/ -ftp://foo:21/ s:ftp h:foo p:/ -ftp://foo:80/ s:ftp h:foo port:80 p:/ -gopher://foo:70/ s:gopher h:foo p:/ -gopher://foo:443/ s:gopher h:foo port:443 p:/ -ws://foo:80/ s:ws h:foo p:/ -ws://foo:81/ s:ws h:foo port:81 p:/ -ws://foo:443/ s:ws h:foo port:443 p:/ -ws://foo:815/ s:ws h:foo port:815 p:/ -wss://foo:80/ s:wss h:foo port:80 p:/ -wss://foo:81/ s:wss h:foo port:81 p:/ -wss://foo:443/ s:wss h:foo p:/ -wss://foo:815/ s:wss h:foo port:815 p:/ -http:/example.com/ s:http h:example.com p:/ -ftp:/example.com/ s:ftp h:example.com p:/ -https:/example.com/ s:https h:example.com p:/ -madeupscheme:/example.com/ s:madeupscheme p:/example.com/ -file:/example.com/ s:file p:/example.com/ -ftps:/example.com/ s:ftps p:/example.com/ -gopher:/example.com/ s:gopher h:example.com p:/ -ws:/example.com/ s:ws h:example.com p:/ -wss:/example.com/ s:wss h:example.com p:/ -data:/example.com/ s:data p:/example.com/ -javascript:/example.com/ s:javascript p:/example.com/ 
-mailto:/example.com/ s:mailto p:/example.com/ -http:example.com/ s:http h:example.com p:/ -ftp:example.com/ s:ftp h:example.com p:/ -https:example.com/ s:https h:example.com p:/ -madeupscheme:example.com/ s:madeupscheme p:example.com/ -ftps:example.com/ s:ftps p:example.com/ -gopher:example.com/ s:gopher h:example.com p:/ -ws:example.com/ s:ws h:example.com p:/ -wss:example.com/ s:wss h:example.com p:/ -data:example.com/ s:data p:example.com/ -javascript:example.com/ s:javascript p:example.com/ -mailto:example.com/ s:mailto p:example.com/ - -# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/segments-userinfo-vs-host.html -http:@www.example.com about:blank s:http h:www.example.com p:/ -http:/@www.example.com s:http h:www.example.com p:/ -http://@www.example.com s:http h:www.example.com p:/ -http:a:b@www.example.com s:http u:a pass:b h:www.example.com p:/ -http:/a:b@www.example.com s:http u:a pass:b h:www.example.com p:/ -http://a:b@www.example.com s:http u:a pass:b h:www.example.com p:/ -http://@pple.com s:http h:pple.com p:/ -http::b@www.example.com s:http pass:b h:www.example.com p:/ -http:/:b@www.example.com s:http pass:b h:www.example.com p:/ -http://:b@www.example.com s:http pass:b h:www.example.com p:/ -http:/:@/www.example.com -http://user@/www.example.com -http:@/www.example.com -http:/@/www.example.com -http://@/www.example.com -https:@/www.example.com -http:a:b@/www.example.com -http:/a:b@/www.example.com -http://a:b@/www.example.com -http::@/www.example.com -http:a:@www.example.com s:http u:a pass: h:www.example.com p:/ -http:/a:@www.example.com s:http u:a pass: h:www.example.com p:/ -http://a:@www.example.com s:http u:a pass: h:www.example.com p:/ -http://www.@pple.com s:http u:www. 
h:pple.com p:/ -http:@:www.example.com -http:/@:www.example.com -http://@:www.example.com -http://:@www.example.com s:http pass: h:www.example.com p:/ - -#Others -/ http://www.example.com/test s:http h:www.example.com p:/ -/test.txt s:http h:www.example.com p:/test.txt -. s:http h:www.example.com p:/ -.. s:http h:www.example.com p:/ -test.txt s:http h:www.example.com p:/test.txt -./test.txt s:http h:www.example.com p:/test.txt -../test.txt s:http h:www.example.com p:/test.txt -../aaa/test.txt s:http h:www.example.com p:/aaa/test.txt -../../test.txt s:http h:www.example.com p:/test.txt -\u4E2D/test.txt s:http h:www.example.com p:/%E4%B8%AD/test.txt -http://www.example2.com s:http h:www.example2.com p:/ - -# Based on http://trac.webkit.org/browser/trunk/LayoutTests/fast/url/host.html - -# Basic canonicalization, uppercase should be converted to lowercase -http://ExAmPlE.CoM http://other.com/ s:http p:/ h:example.com - -# Spaces should fail -http://example\sexample.com - -# This should fail -http://Goo%20\sgoo%7C|.com - -# This should fail -http://GOO\u00a0\u3000goo.com - -# This should fail -http://[] -http://[:] - -# Other types of space (no-break, zero-width, zero-width-no-break) are -# name-prepped away to nothing. -http://GOO\u200b\u2060\ufeffgoo.com s:http p:/ h:googoo.com - -# Ideographic full stop (full-width period for Chinese, etc.) should be -# treated as a dot. -http://www.foo\u3002bar.com s:http p:/ h:www.foo.bar.com - -# Invalid unicode characters should fail... -http://\ufdd0zyx.com - -# ...This is the same as previous but with with escaped. -http://%ef%b7%90zyx.com - -# Test name prepping, fullwidth input should be converted to ASCII and NOT -# IDN-ized. This is "Go" in fullwidth UTF-8/UTF-16. -http://\uff27\uff4f.com s:http p:/ h:go.com - -# URL spec forbids the following. 
-# https://www.w3.org/Bugs/Public/show_bug.cgi?id=24257 -http://\uff05\uff14\uff11.com -http://%ef%bc%85%ef%bc%94%ef%bc%91.com - -# ...%00 in fullwidth should fail (also as escaped UTF-8 input) -http://\uff05\uff10\uff10.com -http://%ef%bc%85%ef%bc%90%ef%bc%90.com - -# Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN -http://\u4f60\u597d\u4f60\u597d s:http p:/ h:xn--6qqa088eba - -# Invalid escaped characters should fail and the percents should be -# escaped. https://www.w3.org/Bugs/Public/show_bug.cgi?id=24191 -http://%zz%66%a.com - -# If we get an invalid character that has been escaped. -http://%25 -http://hello%00 - -# Escaped numbers should be treated like IP addresses if they are. -XFAIL http://%30%78%63%30%2e%30%32%35%30.01 s:http p:/ h:127.0.0.1 -XFAIL http://%30%78%63%30%2e%30%32%35%30.01%2e - -# Invalid escaping should trigger the regular host error handling. -http://%3g%78%63%30%2e%30%32%35%30%2E.01 - -# Something that isn't exactly an IP should get treated as a host and -# spaces escaped. -http://192.168.0.1\shello - -# Fullwidth and escaped UTF-8 fullwidth should still be treated as IP. -# These are "0Xc0.0250.01" in fullwidth. -http://\uff10\uff38\uff43\uff10\uff0e\uff10\uff12\uff15\uff10\uff0e\uff10\uff11 s:http p:/ h:192.168.0.1 - -# Broken IP addresses. -XFAIL http://192.168.0.257 -http://[google.com] diff --git a/tests/wpt.rs b/tests/wpt.rs deleted file mode 100644 index 6a32287b..00000000 --- a/tests/wpt.rs +++ /dev/null @@ -1,223 +0,0 @@ -// Copyright 2013-2014 Simon Sapin. -// -// Licensed under the Apache License, Version 2.0 or the MIT license -// , at your -// option. This file may not be copied, modified, or distributed -// except according to those terms. - -//! 
Tests copied form https://github.com/w3c/web-platform-tests/blob/master/url/ - -extern crate test; -extern crate url; - -use std::char; -use url::{RelativeSchemeData, SchemeData, Url}; - - -fn run_one(entry: Entry) { - // FIXME: Don’t re-indent to make merging the 1.0 branch easier. - { - let Entry { - input, - base, - scheme: expected_scheme, - username: expected_username, - password: expected_password, - host: expected_host, - port: expected_port, - path: expected_path, - query: expected_query, - fragment: expected_fragment, - expected_failure, - } = entry; - let base = match Url::parse(&base) { - Ok(base) => base, - Err(message) => panic!("Error parsing base {}: {}", base, message) - }; - let url = base.join(&input); - if expected_scheme.is_none() { - if url.is_ok() && !expected_failure { - panic!("Expected a parse error for URL {}", input); - } - return - } - let Url { scheme, scheme_data, query, fragment, .. } = match url { - Ok(url) => url, - Err(message) => { - if expected_failure { - return - } else { - panic!("Error parsing URL {}: {}", input, message) - } - } - }; - - macro_rules! 
assert_eq { - ($a: expr, $b: expr) => { - { - let a = $a; - let b = $b; - if a != b { - if expected_failure { - return - } else { - panic!("{:?} != {:?}", a, b) - } - } - } - } - } - - assert_eq!(Some(scheme), expected_scheme); - match scheme_data { - SchemeData::Relative(RelativeSchemeData { - username, password, host, port, default_port: _, path, - }) => { - assert_eq!(username, expected_username); - assert_eq!(password, expected_password); - let host = host.serialize(); - assert_eq!(host, expected_host); - assert_eq!(port, expected_port); - assert_eq!(Some(format!("/{}", str_join(&path, "/"))), expected_path); - }, - SchemeData::NonRelative(scheme_data) => { - assert_eq!(Some(scheme_data), expected_path); - assert_eq!(String::new(), expected_username); - assert_eq!(None, expected_password); - assert_eq!(String::new(), expected_host); - assert_eq!(None, expected_port); - }, - } - fn opt_prepend(prefix: &str, opt_s: Option) -> Option { - opt_s.map(|s| format!("{}{}", prefix, s)) - } - assert_eq!(opt_prepend("?", query), expected_query); - assert_eq!(opt_prepend("#", fragment), expected_fragment); - - assert!(!expected_failure, "Unexpected success for {}", input); - } -} - -// FIMXE: Remove this when &[&str]::join (the new name) lands in the stable channel. 
-#[allow(deprecated)] -fn str_join>(pieces: &[T], separator: &str) -> String { - pieces.connect(separator) -} - -struct Entry { - input: String, - base: String, - scheme: Option, - username: String, - password: Option, - host: String, - port: Option, - path: Option, - query: Option, - fragment: Option, - expected_failure: bool, -} - -fn parse_test_data(input: &str) -> Vec { - let mut tests: Vec = Vec::new(); - for line in input.lines() { - if line == "" || line.starts_with("#") { - continue - } - let mut pieces = line.split(' ').collect::>(); - let expected_failure = pieces[0] == "XFAIL"; - if expected_failure { - pieces.remove(0); - } - let input = unescape(pieces.remove(0)); - let mut test = Entry { - input: input, - base: if pieces.is_empty() || pieces[0] == "" { - tests.last().unwrap().base.clone() - } else { - unescape(pieces.remove(0)) - }, - scheme: None, - username: String::new(), - password: None, - host: String::new(), - port: None, - path: None, - query: None, - fragment: None, - expected_failure: expected_failure, - }; - for piece in pieces { - if piece == "" || piece.starts_with("#") { - continue - } - let colon = piece.find(':').unwrap(); - let value = unescape(&piece[colon + 1..]); - match &piece[..colon] { - "s" => test.scheme = Some(value), - "u" => test.username = value, - "pass" => test.password = Some(value), - "h" => test.host = value, - "port" => test.port = Some(value.parse().unwrap()), - "p" => test.path = Some(value), - "q" => test.query = Some(value), - "f" => test.fragment = Some(value), - _ => panic!("Invalid token") - } - } - tests.push(test) - } - tests -} - -fn unescape(input: &str) -> String { - let mut output = String::new(); - let mut chars = input.chars(); - loop { - match chars.next() { - None => return output, - Some(c) => output.push( - if c == '\\' { - match chars.next().unwrap() { - '\\' => '\\', - 'n' => '\n', - 'r' => '\r', - 's' => ' ', - 't' => '\t', - 'f' => '\x0C', - 'u' => { - char::from_u32(((( - 
chars.next().unwrap().to_digit(16).unwrap()) * 16 + - chars.next().unwrap().to_digit(16).unwrap()) * 16 + - chars.next().unwrap().to_digit(16).unwrap()) * 16 + - chars.next().unwrap().to_digit(16).unwrap()).unwrap() - } - _ => panic!("Invalid test data input"), - } - } else { - c - } - ) - } - } -} - -fn make_test(entry: Entry) -> test::TestDescAndFn { - test::TestDescAndFn { - desc: test::TestDesc { - name: test::DynTestName(format!("{:?} base {:?}", entry.input, entry.base)), - ignore: false, - should_panic: test::ShouldPanic::No, - }, - testfn: test::TestFn::dyn_test_fn(move || run_one(entry)), - } - -} - -fn main() { - test::test_main( - &std::env::args().collect::>(), - parse_test_data(include_str!("urltestdata.txt")).into_iter().map(make_test).collect(), - ) -}