From 60441172775f9f04f99e52265dfde9cb71b30b0e Mon Sep 17 00:00:00 2001 From: Sam Macbeth Date: Thu, 4 Jul 2019 16:30:35 +0200 Subject: [PATCH 1/3] Use cliqz-url-parser for URL processing. --- package.json | 1 + src/utils/utils.js | 32 ++++++++++++++++++++------------ yarn.lock | 7 +++++++ 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/package.json b/package.json index 64899c13b..dc6fa34ea 100644 --- a/package.json +++ b/package.json @@ -42,6 +42,7 @@ }, "homepage": "https://github.com/ghostery/ghostery-extension#readme", "dependencies": { + "@cliqz/url-parser": "^1.0.2", "base64-js": "^1.2.1", "browser-core": "https://github.com/cliqz-oss/browser-core/releases/download/v7.37.4/browser-core-7.37.4.tgz", "classnames": "^2.2.5", diff --git a/src/utils/utils.js b/src/utils/utils.js index 5d347c893..8397281fa 100644 --- a/src/utils/utils.js +++ b/src/utils/utils.js @@ -19,7 +19,7 @@ * @namespace BackgroundUtils */ import { debounce } from 'underscore'; -import url from 'url'; +import { URL } from '@cliqz/url-parser'; import tabInfo from '../classes/TabInfo'; import globals from '../classes/Globals'; import { log, objectEntries } from './common'; @@ -168,7 +168,16 @@ export function processFpeUrl(src) { * @return {Object} contains url parts as properties */ export function processUrl(src) { - if (!src) { + try { + const res = new URL(src); + return { + protocol: res.protocol ? res.protocol.substr(0, res.protocol.length - 1) : '', + host: res.hostname || '', + path: res.pathname ? res.pathname.substr(1) : '', + host_with_path: (res.host || '') + (res.pathname || ''), + anchor: res.hash ? res.hash.substr(1) : '', + }; + } catch (e) { return { protocol: '', host: '', @@ -177,15 +186,6 @@ export function processUrl(src) { anchor: '', }; } - const res = url.parse(src); - - return { - protocol: res.protocol ? res.protocol.substr(0, res.protocol.length - 1) : '', - host: res.hostname || '', - path: res.pathname ? res.pathname.substr(1) : '', - host_with_path: (res.host || '') + (res.pathname || ''), - anchor: res.hash ? res.hash.substr(1) : '', - }; } /** @@ -199,7 +199,15 @@ export function processUrlQuery(src) { return {}; } - return url.parse(src, true).query; + try { + const res = {}; + for (const [key, value] of new URL(src).searchParams.entries()) { + res[key] = value; + } + return res; + } catch (e) { + return {}; + } } /** diff --git a/yarn.lock b/yarn.lock index c0e33caec..8feca6100 100644 --- a/yarn.lock +++ b/yarn.lock @@ -311,6 +311,13 @@ tslib "^1.9.3" tsmaz "^1.2.1" +"@cliqz/url-parser@^1.0.2": + version "1.0.2" + resolved "https://registry.yarnpkg.com/@cliqz/url-parser/-/url-parser-1.0.2.tgz#0c42d73dbe354efad572d9ef39c0aa5d7f6151ea" + integrity sha512-4Y5DQqUv41SWoP7nDRO9PBMH0sSor6aiBT/t1wvjCrUDpG4yhjvAvwRaBN2GoGuHPkWUxT7mZN/FKWtSRZ/FIQ== + dependencies: + tldts "^5.0.3" + "@cnakazawa/watch@^1.0.3": version "1.0.3" resolved "https://registry.yarnpkg.com/@cnakazawa/watch/-/watch-1.0.3.tgz#099139eaec7ebf07a27c1786a3ff64f39464d2ef" From b1628e7a154f5107ca38a70038e96b46c2e6eb11 Mon Sep 17 00:00:00 2001 From: Sam Macbeth Date: Fri, 5 Jul 2019 10:42:55 +0200 Subject: [PATCH 2/3] Second iteration: Use URL object directly, benefit from lazy properties. --- src/classes/EventHandlers.js | 8 ++++---- src/classes/PanelData.js | 4 ++-- src/classes/Policy.js | 4 ++-- src/classes/TabInfo.js | 8 ++++---- src/utils/click2play.js | 4 ++-- src/utils/matcher.js | 8 ++++---- src/utils/utils.js | 17 +++++------------ 7 files changed, 23 insertions(+), 30 deletions(-) diff --git a/src/classes/EventHandlers.js b/src/classes/EventHandlers.js index 157365216..9173db3f5 100644 --- a/src/classes/EventHandlers.js +++ b/src/classes/EventHandlers.js @@ -378,7 +378,7 @@ class EventHandlers { /* ** SMART BLOCKING - Privacy ** */ // block HTTP request on HTTPS page - if (this.policySmartBlock.isInsecureRequest(tab_id, page_protocol, processed.protocol, processed.host)) { + if (this.policySmartBlock.isInsecureRequest(tab_id, page_protocol, processed.scheme, processed.hostname)) { return this._blockHelper(details, tab_id, null, null, request_id, from_redirect, true); } @@ -399,7 +399,7 @@ class EventHandlers { /* ** SMART BLOCKING - Breakage ** */ // allow first party trackers - if (this.policySmartBlock.isFirstPartyRequest(tab_id, page_host, processed.host)) { + if (this.policySmartBlock.isFirstPartyRequest(tab_id, page_host, processed.hostname)) { return { cancel: false }; } @@ -715,11 +715,11 @@ class EventHandlers { * * @private * - * @param {Object} parsedURL + * @param {URL} parsedURL * @return {Boolean} */ _isValidUrl(parsedURL) { - if (parsedURL.protocol.startsWith('http') && parsedURL.host.includes('.') && /[A-Za-z]/.test(parsedURL.host) && !parsedURL.path.includes('_/chrome/newtab')) { + if (parsedURL && parsedURL.protocol.startsWith('http') && parsedURL.isValidHost() && !parsedURL.pathname.includes('_/chrome/newtab')) { return true; } diff --git a/src/classes/PanelData.js b/src/classes/PanelData.js index d5e9cf80e..c0f50576e 100644 --- a/src/classes/PanelData.js +++ b/src/classes/PanelData.js @@ -81,7 +81,7 @@ class PanelData { const { url } = tab; this._activeTab = tab; - this._activeTab.pageHost = url && processUrl(url).host || ''; + this._activeTab.pageHost = url && processUrl(url).hostname || ''; this._attachListeners(); @@ -191,7 +191,7 @@ class PanelData { // Android panel only const { url } = tab; this._activeTab = tab; - this._activeTab.pageHost = url && processUrl(url).host || ''; + this._activeTab.pageHost = url && processUrl(url).hostname || ''; this._setTrackerListAndCategories(); switch (view) { case 'panel': diff --git a/src/classes/Policy.js b/src/classes/Policy.js index 5861d19d8..e982e93f7 100644 --- a/src/classes/Policy.js +++ b/src/classes/Policy.js @@ -63,7 +63,7 @@ class Policy { */ whitelisted(url) { if (url) { - url = processUrl(url).host; + url = processUrl(url).hostname; url = url.replace(/^www\./, ''); const sites = conf.site_whitelist || []; const num_sites = sites.length; @@ -87,7 +87,7 @@ class Policy { */ blacklisted(url) { if (url) { - url = processUrl(url).host; + url = processUrl(url).hostname; url = url.replace(/^www\./, ''); const sites = conf.site_blacklist || []; const num_sites = sites.length; diff --git a/src/classes/TabInfo.js b/src/classes/TabInfo.js index 3d7aaea01..c7a430383 100644 --- a/src/classes/TabInfo.js +++ b/src/classes/TabInfo.js @@ -177,10 +177,10 @@ class TabInfo { _updateUrl(tab_id, tab_url) { const parsed = processUrl(tab_url); this._tabInfo[tab_id].url = tab_url; - this._tabInfo[tab_id].protocol = parsed.protocol; - this._tabInfo[tab_id].host = parsed.host; - this._tabInfo[tab_id].path = parsed.path; - this._tabInfo[tab_id].hash = parsed.anchor; + this._tabInfo[tab_id].protocol = parsed.scheme; + this._tabInfo[tab_id].host = parsed.hostname; + this._tabInfo[tab_id].path = parsed.pathname; + this._tabInfo[tab_id].hash = parsed.hash; this._tabInfo[tab_id].partialScan = false; } } diff --git a/src/utils/click2play.js b/src/utils/click2play.js index 5b2952522..839aa0f0e 100644 --- a/src/utils/click2play.js +++ b/src/utils/click2play.js @@ -115,8 +115,8 @@ export function buildC2P(details, app_id) { * @return {string} url of the internal template of the blocked redirect page */ export function buildRedirectC2P(requestId, redirectUrls, app_id) { - const host_url = processUrl(redirectUrls.url).host; - const redirect_url = processUrl(redirectUrls.redirectUrl).host; + const host_url = processUrl(redirectUrls.url).hostname; + const redirect_url = processUrl(redirectUrls.redirectUrl).hostname; const app_name = bugDb.db.apps[app_id].name; globals.BLOCKED_REDIRECT_DATA = {}; diff --git a/src/utils/matcher.js b/src/utils/matcher.js index 3cf5a2660..e456c10f9 100644 --- a/src/utils/matcher.js +++ b/src/utils/matcher.js @@ -36,13 +36,13 @@ export function isBug(src, tab_url) { found = // pattern classification 2: check host+path hash - _matchesHost(db.patterns.host_path, processedSrc.host, processedSrc.path) || + _matchesHost(db.patterns.host_path, processedSrc.hostname, processedSrc.pathname) || // class 1: check host hash _matchesHost(db.patterns.host, processedSrc.host) || // class 3: check path hash _matchesPath(processedSrc.path) || // class 4: check regex patterns - _matchesRegex(processedSrc.host_with_path); + _matchesRegex(processedSrc.host + processedSrc.pathname); if (typeof tab_url !== 'undefined') { // check firstPartyExceptions @@ -69,9 +69,9 @@ export function isBug(src, tab_url) { */ export function fuzzyUrlMatcher(url, urls) { const parsed = processUrl(url.toLowerCase()); - let tab_host = parsed.host; + let tab_host = parsed.hostname; - const tab_path = parsed.path; + const tab_path = parsed.pathname; if (tab_host.startsWith('www.')) { tab_host = tab_host.slice(4); diff --git a/src/utils/utils.js b/src/utils/utils.js index 8397281fa..fd3c984a8 100644 --- a/src/utils/utils.js +++ b/src/utils/utils.js @@ -165,25 +165,18 @@ export function processFpeUrl(src) { * @memberOf BackgroundUtils * * @param {string} src the source url - * @return {Object} contains url parts as properties + * @return {URL} contains url parts as properties + * */ export function processUrl(src) { try { const res = new URL(src); - return { - protocol: res.protocol ? res.protocol.substr(0, res.protocol.length - 1) : '', - host: res.hostname || '', - path: res.pathname ? res.pathname.substr(1) : '', - host_with_path: (res.host || '') + (res.pathname || ''), - anchor: res.hash ? res.hash.substr(1) : '', - }; + return res; } catch (e) { return { protocol: '', - host: '', - path: '', - host_with_path: '', - anchor: '', + hostname: '', + pathname: '', }; } } From a64f333dfc6392edc6978f650c793bf2c0ae12b0 Mon Sep 17 00:00:00 2001 From: Sam Macbeth Date: Mon, 8 Jul 2019 09:34:54 +0200 Subject: [PATCH 3/3] Fix matcher tests. --- src/utils/matcher.js | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/utils/matcher.js b/src/utils/matcher.js index e456c10f9..796802507 100644 --- a/src/utils/matcher.js +++ b/src/utils/matcher.js @@ -34,13 +34,15 @@ export function isBug(src, tab_url) { const processedSrc = processUrl(src.toLowerCase()); let found = false; + const path = processedSrc.pathname ? processedSrc.pathname.substring(1) : ''; + found = // pattern classification 2: check host+path hash - _matchesHost(db.patterns.host_path, processedSrc.hostname, processedSrc.pathname) || + _matchesHost(db.patterns.host_path, processedSrc.hostname, path) || // class 1: check host hash - _matchesHost(db.patterns.host, processedSrc.host) || + _matchesHost(db.patterns.host, processedSrc.hostname) || // class 3: check path hash - _matchesPath(processedSrc.path) || + _matchesPath(path) || // class 4: check regex patterns _matchesRegex(processedSrc.host + processedSrc.pathname); @@ -71,7 +73,7 @@ export function fuzzyUrlMatcher(url, urls) { const parsed = processUrl(url.toLowerCase()); let tab_host = parsed.hostname; - const tab_path = parsed.pathname; + const tab_path = parsed.pathname ? parsed.pathname.substring(1) : ''; if (tab_host.startsWith('www.')) { tab_host = tab_host.slice(4);