Refactor: rework reject hosts parser

This commit is contained in:
SukkaW 2023-11-03 14:12:09 +08:00
parent d80ea1ecd8
commit 065eeff91f
5 changed files with 148 additions and 129 deletions

View File

@ -20,6 +20,9 @@ const warnOnce = (url, isWhite, ...message) => {
console.warn(url, isWhite ? '(white)' : '(black)', ...message);
};
/**
* @param {string} domain
*/
const normalizeDomain = (domain) => {
if (!domain) return null;
@ -28,7 +31,10 @@ const normalizeDomain = (domain) => {
if (parsed.isIcann || parsed.isPrivate) {
const h = parsed.hostname;
return h?.[0] === '.' ? h.slice(1) : h;
if (h === null) return null;
return h[0] === '.' ? h.slice(1) : h;
}
return null;
@ -259,6 +265,7 @@ function parse($line, gorhill) {
const line = $line.trim();
/** @example line.length */
const len = line.length;
if (len === 0) {
return null;
@ -268,15 +275,13 @@ function parse($line, gorhill) {
const lastChar = line[len - 1];
if (
len === 0
|| firstChar === '/'
firstChar === '/'
// ends with
|| lastChar === '.' // || line.endsWith('.')
|| lastChar === '-' // || line.endsWith('-')
|| lastChar === '_' // || line.endsWith('_')
// special modifier
|| R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
|| ((line.includes('/') || line.includes(':')) && !line.includes('://'))
// || line.includes('$popup')
// || line.includes('$removeparam')
// || line.includes('$popunder')
@ -284,6 +289,10 @@ function parse($line, gorhill) {
return null;
}
if ((line.includes('/') || line.includes(':')) && !line.includes('://')) {
return null;
}
const filter = NetworkFilter.parse(line);
if (filter) {
if (
@ -352,9 +361,11 @@ function parse($line, gorhill) {
return null;
}
/** @example line.endsWith('^') */
const linedEndsWithCaret = lastChar === '^';
/** @example line.endsWith('^|') */
const lineEndsWithCaretVerticalBar = lastChar === '|' && line[len - 2] === '^';
/** @example line.endsWith('^') || line.endsWith('^|') */
const lineEndsWithCaretOrCaretVerticalBar = linedEndsWithCaret || lineEndsWithCaretVerticalBar;
// whitelist (exception)
@ -379,8 +390,19 @@ function parse($line, gorhill) {
* `@@|adsterra.com^|`
*/
if (
// (line.startsWith('@@|') || line.startsWith('@@.'))
(line[2] === '|' || line[2] === '.')
(
// line.startsWith('@@|')
line[2] === '|'
// line.startsWith('@@.')
|| line[2] === '.'
/**
* line.startsWith('@@://')
*
* `@@://googleadservices.com^|`
* `@@://www.googleadservices.com^|`
*/
|| (line[2] === ':' && line[3] === '/' && line[4] === '/')
)
&& (
lineEndsWithCaretOrCaretVerticalBar
|| line.endsWith('$genericblock')
@ -389,6 +411,7 @@ function parse($line, gorhill) {
) {
const _domain = line
.replace('@@||', '')
.replace('@@://', '')
.replace('@@|', '')
.replace('@@.', '')
.replace('^|', '')
@ -409,13 +432,17 @@ function parse($line, gorhill) {
}
}
if (firstChar === '|' && (lineEndsWithCaretOrCaretVerticalBar || line.endsWith('$cname'))) {
if (firstChar === '|') {
const lineEndswithCname = line.endsWith('$cname');
if (lineEndsWithCaretOrCaretVerticalBar || lineEndswithCname) {
/**
* Some malformed filters cannot be parsed by NetworkFilter:
*
* `||smetrics.teambeachbody.com^.com^`
* `||solutions.|pages.indigovision.com^`
* `||vystar..0rg@client.iebetanialaargentina.edu.co^`
* `app-uat.latrobehealth.com.au^predirect.snapdeal.com`
*/
const includeAllSubDomain = line[1] === '|';
@ -426,11 +453,10 @@ function parse($line, gorhill) {
: lineEndsWithCaretOrCaretVerticalBar
? -2
// eslint-disable-next-line sukka/unicorn/no-nested-ternary -- speed
: (line.endsWith('$cname') ? -6 : 0);
: (lineEndswithCname ? -6 : 0);
const _domain = line
// .replace('||', '')
.slice(sliceStart, sliceEnd) // we already make sure line startsWith ||
.slice(sliceStart, sliceEnd) // we already make sure line startsWith "|"
.trim();
const domain = normalizeDomain(_domain);
@ -441,6 +467,7 @@ function parse($line, gorhill) {
return null;
}
}
const lineStartsWithSingleDot = firstChar === '.';
if (
@ -452,16 +479,12 @@ function parse($line, gorhill) {
* `.m.bookben.com^`
* `.wap.x4399.com^`
*/
const _domain = line
.slice(
1,
linedEndsWithCaret
const _domain = line.slice(
1, // remove prefix dot
linedEndsWithCaret // replaceAll('^', '')
? -1
: (lineEndsWithCaretVerticalBar ? -2 : 0)
) // remove prefix dot
.replace('^|', '')
.replaceAll('^', '')
.trim();
: (lineEndsWithCaretVerticalBar ? -2 : 0) // replace('^|', '')
);
const suffix = gorhill.getPublicSuffix(_domain);
if (!gorhill.suffixInPSL(suffix)) {
@ -518,6 +541,7 @@ function parse($line, gorhill) {
* `charlestownwyllie.oaklawnnonantum.com^`
* `-telemetry.officeapps.live.com^`
* `-tracker.biliapi.net`
* `-logging.nextmedia.com`
* `_social_tracking.js^`
*/
if (firstChar !== '|' && lastChar === '^') {
@ -538,35 +562,48 @@ function parse($line, gorhill) {
return null;
}
if (lineStartsWithSingleDot) {
/**
* `.cookielaw.js`
* `.content_tracking.js`
* `.ads.css`
*/
const _domain = line.slice(1);
const suffix = gorhill.getPublicSuffix(_domain);
if (!suffix || !gorhill.suffixInPSL(suffix)) {
// This excludes domain-like resources such as `.gatracking.js`, `.beacon.min.js` and `.cookielaw.js`
return null;
}
const tryNormalizeDomain = normalizeDomain(_domain);
if (tryNormalizeDomain === _domain) {
// the entire rule is domain
return [line, 2];
}
} else {
/**
* `.3.n.2.2.l30.js`
* `_prebid.js`
* `t.yesware.com`
* `ubmcmm.baidustatic.com`
* `://www.smfg-card.$document`
* `portal.librus.pl$$advertisement-module`
* `@@-ds.metric.gstatic.com^|`
* `://gom.ge/cookie.js`
* `://accout-update-smba.jp.$document`
* `@@://googleadservices.com^|`
* `_200x250.png`
* `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
*/
const tryNormalizeDomain = normalizeDomain(line);
if (tryNormalizeDomain) {
if (tryNormalizeDomain === line) {
// the entire rule is domain
return [line, 2];
}
if (lineStartsWithSingleDot && tryNormalizeDomain === line.slice(1)) {
// the line is a domain once its leading dot has been stripped
return [line, 2];
}
}
if (!line.endsWith('.js') && !line.endsWith('.css')) {
console.warn(' * [parse-filter E0010] can not parse:', line);
}
return null;
/* eslint-enable no-nested-ternary */
}
module.exports.processDomainLists = processDomainLists;

View File

@ -27,7 +27,9 @@ const ADGUARD_FILTERS = /** @type {const} */([
[
'https://secure.fanboy.co.nz/easyprivacy.txt',
'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easyprivacy.txt',
'https://easylist-downloads.adblockplus.org/easyprivacy.txt'
'https://easylist-downloads.adblockplus.org/easyprivacy.txt',
'https://ublockorigin.github.io/uAssets/thirdparties/easyprivacy.txt',
'https://ublockorigin.pages.dev/thirdparties/easyprivacy.txt'
]
],
// AdGuard DNS Filter
@ -45,70 +47,42 @@ const ADGUARD_FILTERS = /** @type {const} */([
'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers.txt',
// uBlock Origin Filter List
[
'https://ublockorigin.github.io/uAssets/filters/filters.txt',
'https://ublockorigin.github.io/uAssets/filters/filters.min.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/filters.txt',
'https://ublockorigin.pages.dev/filters/filters.txt'
]
],
[
'https://ublockorigin.github.io/uAssets/filters/filters-2020.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2020.txt',
'https://ublockorigin.pages.dev/filters/filters-2020.txt'
]
],
[
'https://ublockorigin.github.io/uAssets/filters/filters-2021.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2021.txt',
'https://ublockorigin.pages.dev/filters/filters-2021.txt'
]
],
[
'https://ublockorigin.github.io/uAssets/filters/filters-2022.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2022.txt',
'https://ublockorigin.pages.dev/filters/filters-2022.txt'
]
],
[
'https://ublockorigin.github.io/uAssets/filters/filters-2023.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2023.txt',
'https://ublockorigin.pages.dev/filters/filters-2023.txt'
'https://ublockorigin.github.io/uAssetsCDN/filters/filters.min.txt',
'https://ublockorigin.pages.dev/filters/filters.min.txt'
]
],
// uBlock Origin Badware Risk List
[
'https://ublockorigin.github.io/uAssets/filters/badware.txt',
'https://ublockorigin.github.io/uAssets/filters/badware.min.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/badware.txt',
'https://ublockorigin.pages.dev/filters/badware.txt'
'https://ublockorigin.github.io/uAssetsCDN/filters/badware.min.txt',
'https://ublockorigin.pages.dev/filters/badware.min.txt'
]
],
// uBlock Origin Privacy List
[
'https://ublockorigin.github.io/uAssets/filters/privacy.txt',
'https://ublockorigin.github.io/uAssets/filters/privacy.min.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/privacy.txt',
'https://ublockorigin.pages.dev/filters/privacy.txt'
]
],
// uBlock Origin Resource Abuse
[
'https://ublockorigin.github.io/uAssets/filters/resource-abuse.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/resource-abuse.txt',
'https://ublockorigin.pages.dev/filters/resource-abuse.txt'
'https://ublockorigin.github.io/uAssetsCDN/filters/privacy.min.txt',
'https://ublockorigin.pages.dev/filters/privacy.min.txt'
]
],
// uBlock Origin Resource Abuse: merged into the uBlock Origin Privacy List
// [
// 'https://ublockorigin.github.io/uAssets/filters/resource-abuse.txt',
// [
// 'https://ublockorigin.github.io/uAssetsCDN/filters/resource-abuse.txt',
// 'https://ublockorigin.pages.dev/filters/resource-abuse.txt'
// ]
// ],
// uBlock Origin Unbreak
[
'https://ublockorigin.github.io/uAssets/filters/unbreak.txt',
'https://ublockorigin.github.io/uAssets/filters/unbreak.min.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/unbreak.txt',
'https://ublockorigin.pages.dev/filters/unbreak.txt'
'https://ublockorigin.github.io/uAssetsCDN/filters/unbreak.min.txt',
'https://ublockorigin.pages.dev/filters/unbreak.min.txt'
]
],
// AdGuard Base Filter
@ -171,9 +145,7 @@ const ADGUARD_FILTERS = /** @type {const} */([
// BarbBlock
'https://paulgb.github.io/BarbBlock/blacklists/ublock-origin.txt',
// Brave First Party & First Party CNAME
'https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt',
'https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty-cname.txt',
'https://raw.githubusercontent.com/brave/adblock-lists/master/coin-miners.txt'
'https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt'
]);
const PREDEFINED_WHITELIST = [

View File

@ -340,6 +340,8 @@ th.bing.com
images.ecomm.microsoft.com
.yammerusercontent.com
.assets-yammer.com
.edgestorewebcdn.azureedge.net
img-s-msn-com.akamaized.net
# Microsoft Cookie Consent
wcpstatic.microsoft.com
# Xbox

View File

@ -519,6 +519,11 @@ www.kuguopush.com
.adcast.deviantart.com
.inside.bitcomet.com
.cdnads.com
.stats.esomniture.com
.adalyser.com
.tradedoubler.com
.xiti.com
.cjt1.net
.youxiaoad.com
.iteye.com
@ -1749,7 +1754,6 @@ ntp.msn.cn
assets.msn.cn
api.msn.com
browser.events.data.msn.com
img-s-msn-com.akamaized.net
# >> OPPO
adsfs.oppomobile.com
@ -1820,6 +1824,7 @@ adserve2.tom.com
.discovery.tom.com
# brightdata (luminati) SDK
.l-err.biz
.lum-sdk.io
.luminatinet.com
.luminati.io
@ -1829,3 +1834,4 @@ adserve2.tom.com
.hola.org
.h-vpn.org
.holashop.org
.svd-cdn.com

View File

@ -83,7 +83,7 @@ DOMAIN-SUFFIX,tw1.ru
# >> General
DOMAIN-KEYWORD,track.tiara
DOMAIN-KEYWORD,adservice
# DOMAIN-KEYWORD,adservice # conflict with @@://www.googleadservices.com^|
DOMAIN-KEYWORD,umeng
DOMAIN-KEYWORD,adsby
DOMAIN-KEYWORD,adsdk
@ -108,6 +108,8 @@ DOMAIN-KEYWORD,_vmind.qqvideo.tc.qq.com
DOMAIN-KEYWORD,-logging.nextmedia.com
DOMAIN-KEYWORD,-spiky.clevertap-prod.com
DOMAIN-KEYWORD,.engage.3m.
# -telemetry.officeapps.live.com.mcas.ms
# -telemetry.officeapps.live.com
DOMAIN-KEYWORD,telemetry.officeapps.live.com
DOMAIN-KEYWORD,-launches.appsflyersdk.com