From 34fc56aae29a0e76b24f4f1fdcead58613bc7408 Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Thu, 28 Apr 2016 15:12:20 +0200 Subject: [PATCH 1/3] Test that serialization + parsing is (mostly) idempotent. --- src/lib.rs | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/src/lib.rs b/src/lib.rs index 12659418..96936507 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -346,6 +346,21 @@ impl Url { if let (Some(query_start), Some(fragment_start)) = (self.query_start, self.fragment_start) { assert!(fragment_start > query_start); } + + let other = Url::parse(self.as_str()).unwrap(); + assert_eq!(&self.serialization, &other.serialization); + assert_eq!(self.scheme_end, other.scheme_end); + assert_eq!(self.username_end, other.username_end); + assert_eq!(self.host_start, other.host_start); + assert_eq!(self.host_end, other.host_end); + assert!(self.host == other.host || + // XXX No host round-trips to empty host. + // See https://github.com/whatwg/url/issues/79 + (self.host_str(), other.host_str()) == (None, Some(""))); + assert_eq!(self.port, other.port); + assert_eq!(self.path_start, other.path_start); + assert_eq!(self.query_start, other.query_start); + assert_eq!(self.fragment_start, other.fragment_start); } /// Return the origin of this URL (https://url.spec.whatwg.org/#origin) From 52ce55f570ed8918cdd099e479848edadea6fc8e Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Mon, 2 May 2016 19:51:26 +0200 Subject: [PATCH 2/3] Add Url::path_segments_mut, fix #188 --- Cargo.toml | 2 +- src/lib.rs | 95 ++++++++---------------- src/path_segments.rs | 172 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 203 insertions(+), 66 deletions(-) create mode 100644 src/path_segments.rs diff --git a/Cargo.toml b/Cargo.toml index 452f3345..d8c501dc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "url" -version = "1.0.1" +version = "1.1.0" authors = ["The rust-url developers"] description = "URL library for Rust, based on the WHATWG URL 
Standard" diff --git a/src/lib.rs b/src/lib.rs index 96936507..f1f487c2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -142,12 +142,14 @@ use std::str; pub use origin::{Origin, OpaqueOrigin}; pub use host::{Host, HostAndPort, SocketAddrs}; +pub use path_segments::PathSegmentsMut; pub use parser::ParseError; pub use slicing::Position; mod encoding; mod host; mod origin; +mod path_segments; mod parser; mod slicing; @@ -261,7 +263,7 @@ impl Url { self.serialization } - /// For internal testing. + /// For internal testing, not part of the public API. /// /// Methods of the `Url` struct assume a number of invariants. /// This checks each of these invariants and panic if one is not met. @@ -545,7 +547,10 @@ impl Url { /// return an iterator of '/' slash-separated path segments, /// each as a percent-encoded ASCII string. /// - /// Return `None` for cannot-be-a-base URLs, or an iterator of at least one string. + /// Return `None` for cannot-be-a-base URLs. + /// + /// When `Some` is returned, the iterator always contains at least one string + /// (which may be empty). pub fn path_segments(&self) -> Option> { let path = self.path(); if path.starts_with('/') { @@ -694,12 +699,16 @@ impl Url { form_urlencoded::Serializer::for_suffix(query, query_start + "?".len()) } - /// Change this URL’s path. - pub fn set_path(&mut self, mut path: &str) { - let (old_after_path_pos, after_path) = match (self.query_start, self.fragment_start) { + fn take_after_path(&mut self) -> (u32, String) { + match (self.query_start, self.fragment_start) { (Some(i), _) | (None, Some(i)) => (i, self.slice(i..).to_owned()), (None, None) => (to_u32(self.serialization.len()).unwrap(), String::new()) - }; + } + } + + /// Change this URL’s path. 
+ pub fn set_path(&mut self, mut path: &str) { + let (old_after_path_pos, after_path) = self.take_after_path(); let cannot_be_a_base = self.cannot_be_a_base(); let scheme_type = SchemeType::from(self.scheme()); self.serialization.truncate(self.path_start as usize); @@ -715,73 +724,29 @@ impl Url { parser.parse_path_start(scheme_type, &mut has_host, parser::Input::new(path)); } }); - let new_after_path_pos = to_u32(self.serialization.len()).unwrap(); - let adjust = |index: &mut u32| { - *index -= old_after_path_pos; - *index += new_after_path_pos; - }; - if let Some(ref mut index) = self.query_start { adjust(index) } - if let Some(ref mut index) = self.fragment_start { adjust(index) } - self.serialization.push_str(&after_path) + self.restore_after_path(old_after_path_pos, &after_path); } - /// Remove the last segment of this URL’s path. + /// Return an object with methods to manipulate this URL’s path segments. /// - /// If this URL is cannot-be-a-base, do nothing and return `Err`. - /// If this URL is not cannot-be-a-base and its path is `/`, do nothing and return `Ok`. - // Temporarily private: https://github.com/servo/rust-url/issues/188 - /*pub*/ fn pop_path_segment(&mut self) -> Result<(), ()> { + /// Return `Err(())` if this URL is cannot-be-a-base. 
+ pub fn path_segments_mut(&mut self) -> Result { if self.cannot_be_a_base() { - return Err(()) - } - let last_slash; - let path_len; - { - let path = self.path(); - last_slash = path.rfind('/').unwrap(); - path_len = path.len(); - }; - if last_slash > 0 { - // Found a slash other than the initial one - let last_slash = last_slash + self.path_start as usize; - let path_end = path_len + self.path_start as usize; - self.serialization.drain(last_slash..path_end); - let offset = (path_end - last_slash) as u32; - if let Some(ref mut index) = self.query_start { *index -= offset } - if let Some(ref mut index) = self.fragment_start { *index -= offset } + Err(()) + } else { + Ok(path_segments::new(self)) } - Ok(()) } - /// Add a segment at the end of this URL’s path. - /// - /// If this URL is cannot-be-a-base, do nothing and return `Err`. - // Temporarily private: https://github.com/servo/rust-url/issues/188 - /*pub*/ fn push_path_segment(&mut self, segment: &str) -> Result<(), ()> { - if self.cannot_be_a_base() { - return Err(()) - } - let after_path = match (self.query_start, self.fragment_start) { - (Some(i), _) | (None, Some(i)) => { - let s = self.slice(i..).to_owned(); - self.serialization.truncate(i as usize); - s - }, - (None, None) => String::new() + fn restore_after_path(&mut self, old_after_path_position: u32, after_path: &str) { + let new_after_path_position = to_u32(self.serialization.len()).unwrap(); + let adjust = |index: &mut u32| { + *index -= old_after_path_position; + *index += new_after_path_position; }; - let scheme_type = SchemeType::from(self.scheme()); - let path_start = self.path_start as usize; - self.serialization.push('/'); - self.mutate(|parser| { - parser.context = parser::Context::PathSegmentSetter; - let mut has_host = true; // FIXME account for this? 
- parser.parse_path(scheme_type, &mut has_host, path_start, parser::Input::new(segment)) - }); - let offset = to_u32(self.serialization.len()).unwrap() - self.path_start; - if let Some(ref mut index) = self.query_start { *index += offset } - if let Some(ref mut index) = self.fragment_start { *index += offset } - self.serialization.push_str(&after_path); - Ok(()) + if let Some(ref mut index) = self.query_start { adjust(index) } + if let Some(ref mut index) = self.fragment_start { adjust(index) } + self.serialization.push_str(after_path) } /// Change this URL’s port number. diff --git a/src/path_segments.rs b/src/path_segments.rs new file mode 100644 index 00000000..f46c3724 --- /dev/null +++ b/src/path_segments.rs @@ -0,0 +1,172 @@ +// Copyright 2016 The rust-url developers. +// +// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or +// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license +// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your +// option. This file may not be copied, modified, or distributed +// except according to those terms. + +use parser::{self, SchemeType}; +use std::str; +use Url; + +/// Exposes methods to manipulate the path of an URL that is not cannot-be-a-base. +/// +/// The path always starts with a `/` slash, and is made of slash-separated segments. +/// There is always at least one segment (which may be the empty string). 
+/// +/// Examples: +/// +/// ```rust +/// # use url::Url; +/// let mut url = Url::parse("mailto:me@example.com").unwrap(); +/// assert!(url.path_segments_mut().is_err()); +/// +/// let mut url = Url::parse("http://example.net/foo/index.html").unwrap(); +/// url.path_segments_mut().unwrap().pop().push("img").push("100%.png"); +/// assert_eq!(url.as_str(), "http://example.net/foo/img/100%25.png"); +/// ``` +pub struct PathSegmentsMut<'a> { + url: &'a mut Url, + after_first_slash: usize, + after_path: String, + old_after_path_position: u32, +} + +// Not re-exported outside the crate +pub fn new(url: &mut Url) -> PathSegmentsMut { + let (old_after_path_position, after_path) = url.take_after_path(); + debug_assert!(url.byte_at(url.path_start) == b'/'); + PathSegmentsMut { + after_first_slash: url.path_start as usize + "/".len(), + url: url, + old_after_path_position: old_after_path_position, + after_path: after_path, + } +} + +impl<'a> Drop for PathSegmentsMut<'a> { + fn drop(&mut self) { + self.url.restore_after_path(self.old_after_path_position, &self.after_path) + } +} + +impl<'a> PathSegmentsMut<'a> { + /// Remove all segments in the path, leaving the minimal `url.path() == "/"`. + /// + /// Returns `&mut Self` so that method calls can be chained. + /// + /// Example: + /// + /// ```rust + /// # use url::Url; + /// let mut url = Url::parse("https://github.com/servo/rust-url/").unwrap(); + /// url.path_segments_mut().unwrap().clear().push("logout"); + /// assert_eq!(url.as_str(), "https://github.com/logout"); + /// ``` + pub fn clear(&mut self) -> &mut Self { + self.url.serialization.truncate(self.after_first_slash); + self + } + + /// Remove the last segment of this URL’s path if it is empty, + /// except if there was only one segment to begin with. + /// + /// In other words, remove one path trailing slash, if any, + /// unless it is also the initial slash (so this does nothing if `url.path() == "/"`). 
+ /// + /// Returns `&mut Self` so that method calls can be chained. + /// + /// Example: + /// + /// ```rust + /// # use url::Url; + /// let mut url = Url::parse("https://github.com/servo/rust-url/").unwrap(); + /// url.path_segments_mut().unwrap().push("pulls"); + /// assert_eq!(url.as_str(), "https://github.com/servo/rust-url//pulls"); + /// + /// let mut url = Url::parse("https://github.com/servo/rust-url/").unwrap(); + /// url.path_segments_mut().unwrap().pop_if_empty().push("pulls"); + /// assert_eq!(url.as_str(), "https://github.com/servo/rust-url/pulls"); + /// ``` + pub fn pop_if_empty(&mut self) -> &mut Self { + if self.url.serialization[self.after_first_slash..].ends_with('/') { + self.url.serialization.pop(); + } + self + } + + /// Remove the last segment of this URL’s path. + /// + /// If the path only has one segment, make it empty such that `url.path() == "/"`. + /// + /// Returns `&mut Self` so that method calls can be chained. + pub fn pop(&mut self) -> &mut Self { + let last_slash = self.url.serialization[self.after_first_slash..].rfind('/').unwrap_or(0); + self.url.serialization.truncate(self.after_first_slash + last_slash); + self + } + + /// Append the given segment at the end of this URL’s path. + /// + /// See the documentation for `.extend()`. + /// + /// Returns `&mut Self` so that method calls can be chained. + pub fn push(&mut self, segment: &str) -> &mut Self { + self.extend(Some(segment)) + } + + /// Append each segment from the given iterator at the end of this URL’s path. + /// + /// Each segment is percent-encoded like in `Url::parse` or `Url::join`, + /// except that `%` and `/` characters are also encoded (to `%25` and `%2F`). + /// This is unlike `Url::parse` where `%` is left as-is in case some of the input + /// is already percent-encoded, and `/` denotes a path segment separator. 
+ /// + /// Note that, in addition to slashes between new segments, + /// this always adds a slash between the existing path and the new segments + /// *except* if the existing path is `"/"`. + /// If the previous last segment was empty (if the path had a trailing slash) + /// the path after `.extend()` will contain two consecutive slashes. + /// If that is undesired, call `.pop_if_empty()` first. + /// + /// To obtain a behavior similar to `Url::join`, call `.pop()` unconditionally first. + /// + /// Returns `&mut Self` so that method calls can be chained. + /// + /// Example: + /// + /// ```rust + /// # use url::Url; + /// let mut url = Url::parse("https://github.com/").unwrap(); + /// let org = "servo"; + /// let repo = "rust-url"; + /// let issue_number = "188"; + /// url.path_segments_mut().unwrap().extend(&[org, repo, "issues", issue_number]); + /// assert_eq!(url.as_str(), "https://github.com/servo/rust-url/issues/188"); + /// ``` + pub fn extend(&mut self, segments: I) -> &mut Self + where I: IntoIterator, I::Item: AsRef { + let scheme_type = SchemeType::from(self.url.scheme()); + let path_start = self.url.path_start as usize; + self.url.mutate(|parser| { + parser.context = parser::Context::PathSegmentSetter; + for segment in segments { + if parser.serialization.len() > path_start + 1 { + parser.serialization.push('/'); + } + let mut has_host = true; // FIXME account for this? + parser.parse_path(scheme_type, &mut has_host, path_start, + parser::Input::new(segment.as_ref())); + } + }); + self + } + + /// For internal testing, not part of the public API. + #[doc(hidden)] + pub fn assert_url_invariants(&mut self) -> &mut Self { + self.url.assert_invariants(); + self + } +} From e66486a7cd29066614208c7746e57e930b60d48f Mon Sep 17 00:00:00 2001 From: Simon Sapin Date: Tue, 3 May 2016 09:00:19 +0200 Subject: [PATCH 3/3] Make pushing a "." or ".." segment a no-op. 
--- src/path_segments.rs | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/path_segments.rs b/src/path_segments.rs index f46c3724..a624ff39 100644 --- a/src/path_segments.rs +++ b/src/path_segments.rs @@ -23,8 +23,8 @@ use Url; /// assert!(url.path_segments_mut().is_err()); /// /// let mut url = Url::parse("http://example.net/foo/index.html").unwrap(); -/// url.path_segments_mut().unwrap().pop().push("img").push("100%.png"); -/// assert_eq!(url.as_str(), "http://example.net/foo/img/100%25.png"); +/// url.path_segments_mut().unwrap().pop().push("img").push("2/100%.png"); +/// assert_eq!(url.as_str(), "http://example.net/foo/img/2%2F100%25.png"); /// ``` pub struct PathSegmentsMut<'a> { url: &'a mut Url, @@ -145,6 +145,16 @@ impl<'a> PathSegmentsMut<'a> { /// url.path_segments_mut().unwrap().extend(&[org, repo, "issues", issue_number]); /// assert_eq!(url.as_str(), "https://github.com/servo/rust-url/issues/188"); /// ``` + /// + /// In order to make sure that parsing the serialization of an URL gives the same URL, + /// a segment is ignored if it is `"."` or `".."`: + /// + /// ```rust + /// # use url::Url; + /// let mut url = Url::parse("https://github.com/servo").unwrap(); + /// url.path_segments_mut().unwrap().extend(&["..", "rust-url", ".", "pulls"]); + /// assert_eq!(url.as_str(), "https://github.com/servo/rust-url/pulls"); + /// ``` pub fn extend(&mut self, segments: I) -> &mut Self where I: IntoIterator, I::Item: AsRef { let scheme_type = SchemeType::from(self.url.scheme()); @@ -152,12 +162,16 @@ impl<'a> PathSegmentsMut<'a> { self.url.mutate(|parser| { parser.context = parser::Context::PathSegmentSetter; for segment in segments { + let segment = segment.as_ref(); + if matches!(segment, "." | "..") { + continue + } if parser.serialization.len() > path_start + 1 { parser.serialization.push('/'); } let mut has_host = true; // FIXME account for this? 
parser.parse_path(scheme_type, &mut has_host, path_start, - parser::Input::new(segment.as_ref())); + parser::Input::new(segment)); } }); self