Refactor: rework reject hosts parser

This commit is contained in:
SukkaW 2023-11-03 14:12:09 +08:00
parent d80ea1ecd8
commit 065eeff91f
5 changed files with 148 additions and 129 deletions

View File

@ -20,6 +20,9 @@ const warnOnce = (url, isWhite, ...message) => {
console.warn(url, isWhite ? '(white)' : '(black)', ...message); console.warn(url, isWhite ? '(white)' : '(black)', ...message);
}; };
/**
* @param {string} domain
*/
const normalizeDomain = (domain) => { const normalizeDomain = (domain) => {
if (!domain) return null; if (!domain) return null;
@ -28,7 +31,10 @@ const normalizeDomain = (domain) => {
if (parsed.isIcann || parsed.isPrivate) { if (parsed.isIcann || parsed.isPrivate) {
const h = parsed.hostname; const h = parsed.hostname;
return h?.[0] === '.' ? h.slice(1) : h;
if (h === null) return null;
return h[0] === '.' ? h.slice(1) : h;
} }
return null; return null;
@ -259,6 +265,7 @@ function parse($line, gorhill) {
const line = $line.trim(); const line = $line.trim();
/** @example line.length */
const len = line.length; const len = line.length;
if (len === 0) { if (len === 0) {
return null; return null;
@ -268,15 +275,13 @@ function parse($line, gorhill) {
const lastChar = line[len - 1]; const lastChar = line[len - 1];
if ( if (
len === 0 firstChar === '/'
|| firstChar === '/'
// ends with // ends with
|| lastChar === '.' // || line.endsWith('.') || lastChar === '.' // || line.endsWith('.')
|| lastChar === '-' // || line.endsWith('-') || lastChar === '-' // || line.endsWith('-')
|| lastChar === '_' // || line.endsWith('_') || lastChar === '_' // || line.endsWith('_')
// special modifier // special modifier
|| R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line) || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
|| ((line.includes('/') || line.includes(':')) && !line.includes('://'))
// || line.includes('$popup') // || line.includes('$popup')
// || line.includes('$removeparam') // || line.includes('$removeparam')
// || line.includes('$popunder') // || line.includes('$popunder')
@ -284,6 +289,10 @@ function parse($line, gorhill) {
return null; return null;
} }
if ((line.includes('/') || line.includes(':')) && !line.includes('://')) {
return null;
}
const filter = NetworkFilter.parse(line); const filter = NetworkFilter.parse(line);
if (filter) { if (filter) {
if ( if (
@ -352,9 +361,11 @@ function parse($line, gorhill) {
return null; return null;
} }
/** @example line.endsWith('^') */
const linedEndsWithCaret = lastChar === '^'; const linedEndsWithCaret = lastChar === '^';
/** @example line.endsWith('^|') */
const lineEndsWithCaretVerticalBar = lastChar === '|' && line[len - 2] === '^'; const lineEndsWithCaretVerticalBar = lastChar === '|' && line[len - 2] === '^';
/** @example line.endsWith('^') || line.endsWith('^|') */
const lineEndsWithCaretOrCaretVerticalBar = linedEndsWithCaret || lineEndsWithCaretVerticalBar; const lineEndsWithCaretOrCaretVerticalBar = linedEndsWithCaret || lineEndsWithCaretVerticalBar;
// whitelist (exception) // whitelist (exception)
@ -379,8 +390,19 @@ function parse($line, gorhill) {
* `@@|adsterra.com^|` * `@@|adsterra.com^|`
*/ */
if ( if (
// (line.startsWith('@@|') || line.startsWith('@@.')) (
(line[2] === '|' || line[2] === '.') // line.startsWith('@@|')
line[2] === '|'
// line.startsWith('@@.')
|| line[2] === '.'
/**
* line.startsWith('@@://')
*
* `@@://googleadservices.com^|`
* `@@://www.googleadservices.com^|`
*/
|| (line[2] === ':' && line[3] === '/' && line[4] === '/')
)
&& ( && (
lineEndsWithCaretOrCaretVerticalBar lineEndsWithCaretOrCaretVerticalBar
|| line.endsWith('$genericblock') || line.endsWith('$genericblock')
@ -389,6 +411,7 @@ function parse($line, gorhill) {
) { ) {
const _domain = line const _domain = line
.replace('@@||', '') .replace('@@||', '')
.replace('@@://', '')
.replace('@@|', '') .replace('@@|', '')
.replace('@@.', '') .replace('@@.', '')
.replace('^|', '') .replace('^|', '')
@ -409,37 +432,41 @@ function parse($line, gorhill) {
} }
} }
if (firstChar === '|' && (lineEndsWithCaretOrCaretVerticalBar || line.endsWith('$cname'))) { if (firstChar === '|') {
/** const lineEndswithCname = line.endsWith('$cname');
* Some malformed filters can not be parsed by NetworkFilter:
*
* `||smetrics.teambeachbody.com^.com^`
* `||solutions.|pages.indigovision.com^`
* `||vystar..0rg@client.iebetanialaargentina.edu.co^`
*/
const includeAllSubDomain = line[1] === '|'; if (lineEndsWithCaretOrCaretVerticalBar || lineEndswithCname) {
/**
* Some malformed filters can not be parsed by NetworkFilter:
*
* `||smetrics.teambeachbody.com^.com^`
* `||solutions.|pages.indigovision.com^`
* `||vystar..0rg@client.iebetanialaargentina.edu.co^`
* `app-uat.latrobehealth.com.au^predirect.snapdeal.com`
*/
const sliceStart = includeAllSubDomain ? 2 : 1; const includeAllSubDomain = line[1] === '|';
const sliceEnd = lastChar === '^'
? -1
: lineEndsWithCaretOrCaretVerticalBar
? -2
// eslint-disable-next-line sukka/unicorn/no-nested-ternary -- speed
: (line.endsWith('$cname') ? -6 : 0);
const _domain = line const sliceStart = includeAllSubDomain ? 2 : 1;
// .replace('||', '') const sliceEnd = lastChar === '^'
.slice(sliceStart, sliceEnd) // we already make sure line startsWith || ? -1
.trim(); : lineEndsWithCaretOrCaretVerticalBar
? -2
// eslint-disable-next-line sukka/unicorn/no-nested-ternary -- speed
: (lineEndswithCname ? -6 : 0);
const domain = normalizeDomain(_domain); const _domain = line
if (domain) { .slice(sliceStart, sliceEnd) // we already make sure line startsWith "|"
return [domain, includeAllSubDomain ? 2 : 1]; .trim();
const domain = normalizeDomain(_domain);
if (domain) {
return [domain, includeAllSubDomain ? 2 : 1];
}
console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
return null;
} }
console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
return null;
} }
const lineStartsWithSingleDot = firstChar === '.'; const lineStartsWithSingleDot = firstChar === '.';
@ -452,16 +479,12 @@ function parse($line, gorhill) {
* `.m.bookben.com^` * `.m.bookben.com^`
* `.wap.x4399.com^` * `.wap.x4399.com^`
*/ */
const _domain = line const _domain = line.slice(
.slice( 1, // remove prefix dot
1, linedEndsWithCaret // replaceAll('^', '')
linedEndsWithCaret ? -1
? -1 : (lineEndsWithCaretVerticalBar ? -2 : 0) // replace('^|', '')
: (lineEndsWithCaretVerticalBar ? -2 : 0) );
) // remove prefix dot
.replace('^|', '')
.replaceAll('^', '')
.trim();
const suffix = gorhill.getPublicSuffix(_domain); const suffix = gorhill.getPublicSuffix(_domain);
if (!gorhill.suffixInPSL(suffix)) { if (!gorhill.suffixInPSL(suffix)) {
@ -479,10 +502,10 @@ function parse($line, gorhill) {
} }
/** /**
* `|http://x.o2.pl^` * `|http://x.o2.pl^`
* `://mine.torrent.pw^` * `://mine.torrent.pw^`
* `://say.ac^` * `://say.ac^`
*/ */
if ( if (
( (
line.startsWith('://') line.startsWith('://')
@ -513,13 +536,14 @@ function parse($line, gorhill) {
} }
/** /**
* `_vmind.qqvideo.tc.qq.com^` * `_vmind.qqvideo.tc.qq.com^`
* `arketing.indianadunes.com^` * `arketing.indianadunes.com^`
* `charlestownwyllie.oaklawnnonantum.com^` * `charlestownwyllie.oaklawnnonantum.com^`
* `-telemetry.officeapps.live.com^` * `-telemetry.officeapps.live.com^`
* `-tracker.biliapi.net` * `-tracker.biliapi.net`
* `_social_tracking.js^` * `-logging.nextmedia.com`
*/ * `_social_tracking.js^`
*/
if (firstChar !== '|' && lastChar === '^') { if (firstChar !== '|' && lastChar === '^') {
const _domain = line.slice(0, -1); const _domain = line.slice(0, -1);
@ -538,35 +562,48 @@ function parse($line, gorhill) {
return null; return null;
} }
/** if (lineStartsWithSingleDot) {
* `.3.n.2.2.l30.js` /**
* `_prebid.js` * `.cookielaw.js`
* `t.yesware.com` * `.content_tracking.js`
* `ubmcmm.baidustatic.com` * `.ads.css`
* `portal.librus.pl$$advertisement-module` */
* `@@-ds.metric.gstatic.com^|` const _domain = line.slice(1);
* `://gom.ge/cookie.js`
* `://accout-update-smba.jp.$document` const suffix = gorhill.getPublicSuffix(_domain);
* `@@://googleadservices.com^|` if (!suffix || !gorhill.suffixInPSL(suffix)) {
*/ // This excludes domain-like resources such as `.gatracking.js`, `.beacon.min.js` and `.cookielaw.js`
const tryNormalizeDomain = normalizeDomain(line); return null;
if (tryNormalizeDomain) { }
const tryNormalizeDomain = normalizeDomain(_domain);
if (tryNormalizeDomain === _domain) {
// the entire rule is domain
return [line, 2];
}
} else {
/**
* `_prebid.js`
* `t.yesware.com`
* `ubmcmm.baidustatic.com`
* `://www.smfg-card.$document`
* `portal.librus.pl$$advertisement-module`
* `@@-ds.metric.gstatic.com^|`
* `://gom.ge/cookie.js`
* `://accout-update-smba.jp.$document`
* `_200x250.png`
* `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
*/
const tryNormalizeDomain = normalizeDomain(line);
if (tryNormalizeDomain === line) { if (tryNormalizeDomain === line) {
// the entire rule is domain // the entire rule is domain
return [line, 2]; return [line, 2];
} }
if (lineStartsWithSingleDot && tryNormalizeDomain === line.slice(1)) {
// dot prefixed line has stripped
return [line, 2];
}
} }
if (!line.endsWith('.js') && !line.endsWith('.css')) { console.warn(' * [parse-filter E0010] can not parse:', line);
console.warn(' * [parse-filter E0010] can not parse:', line);
}
return null; return null;
/* eslint-enable no-nested-ternary */
} }
module.exports.processDomainLists = processDomainLists; module.exports.processDomainLists = processDomainLists;

View File

@ -27,7 +27,9 @@ const ADGUARD_FILTERS = /** @type {const} */([
[ [
'https://secure.fanboy.co.nz/easyprivacy.txt', 'https://secure.fanboy.co.nz/easyprivacy.txt',
'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easyprivacy.txt', 'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easyprivacy.txt',
'https://easylist-downloads.adblockplus.org/easyprivacy.txt' 'https://easylist-downloads.adblockplus.org/easyprivacy.txt',
'https://ublockorigin.github.io/uAssets/thirdparties/easyprivacy.txt',
'https://ublockorigin.pages.dev/thirdparties/easyprivacy.txt'
] ]
], ],
// AdGuard DNS Filter // AdGuard DNS Filter
@ -45,70 +47,42 @@ const ADGUARD_FILTERS = /** @type {const} */([
'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers.txt', 'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers.txt',
// uBlock Origin Filter List // uBlock Origin Filter List
[ [
'https://ublockorigin.github.io/uAssets/filters/filters.txt', 'https://ublockorigin.github.io/uAssets/filters/filters.min.txt',
[ [
'https://ublockorigin.github.io/uAssetsCDN/filters/filters.txt', 'https://ublockorigin.github.io/uAssetsCDN/filters/filters.min.txt',
'https://ublockorigin.pages.dev/filters/filters.txt' 'https://ublockorigin.pages.dev/filters/filters.min.txt'
]
],
[
'https://ublockorigin.github.io/uAssets/filters/filters-2020.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2020.txt',
'https://ublockorigin.pages.dev/filters/filters-2020.txt'
]
],
[
'https://ublockorigin.github.io/uAssets/filters/filters-2021.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2021.txt',
'https://ublockorigin.pages.dev/filters/filters-2021.txt'
]
],
[
'https://ublockorigin.github.io/uAssets/filters/filters-2022.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2022.txt',
'https://ublockorigin.pages.dev/filters/filters-2022.txt'
]
],
[
'https://ublockorigin.github.io/uAssets/filters/filters-2023.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2023.txt',
'https://ublockorigin.pages.dev/filters/filters-2023.txt'
] ]
], ],
// uBlock Origin Badware Risk List // uBlock Origin Badware Risk List
[ [
'https://ublockorigin.github.io/uAssets/filters/badware.txt', 'https://ublockorigin.github.io/uAssets/filters/badware.min.txt',
[ [
'https://ublockorigin.github.io/uAssetsCDN/filters/badware.txt', 'https://ublockorigin.github.io/uAssetsCDN/filters/badware.min.txt',
'https://ublockorigin.pages.dev/filters/badware.txt' 'https://ublockorigin.pages.dev/filters/badware.min.txt'
] ]
], ],
// uBlock Origin Privacy List // uBlock Origin Privacy List
[ [
'https://ublockorigin.github.io/uAssets/filters/privacy.txt', 'https://ublockorigin.github.io/uAssets/filters/privacy.min.txt',
[ [
'https://ublockorigin.github.io/uAssetsCDN/filters/privacy.txt', 'https://ublockorigin.github.io/uAssetsCDN/filters/privacy.min.txt',
'https://ublockorigin.pages.dev/filters/privacy.txt' 'https://ublockorigin.pages.dev/filters/privacy.min.txt'
]
],
// uBlock Origin Resource Abuse
[
'https://ublockorigin.github.io/uAssets/filters/resource-abuse.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/resource-abuse.txt',
'https://ublockorigin.pages.dev/filters/resource-abuse.txt'
] ]
], ],
// uBlock Origin Resource Abuse: merged into the uBlock Origin Privacy List
// [
// 'https://ublockorigin.github.io/uAssets/filters/resource-abuse.txt',
// [
// 'https://ublockorigin.github.io/uAssetsCDN/filters/resource-abuse.txt',
// 'https://ublockorigin.pages.dev/filters/resource-abuse.txt'
// ]
// ],
// uBlock Origin Unbreak // uBlock Origin Unbreak
[ [
'https://ublockorigin.github.io/uAssets/filters/unbreak.txt', 'https://ublockorigin.github.io/uAssets/filters/unbreak.min.txt',
[ [
'https://ublockorigin.github.io/uAssetsCDN/filters/unbreak.txt', 'https://ublockorigin.github.io/uAssetsCDN/filters/unbreak.min.txt',
'https://ublockorigin.pages.dev/filters/unbreak.txt' 'https://ublockorigin.pages.dev/filters/unbreak.min.txt'
] ]
], ],
// AdGuard Base Filter // AdGuard Base Filter
@ -171,9 +145,7 @@ const ADGUARD_FILTERS = /** @type {const} */([
// BarbBlock // BarbBlock
'https://paulgb.github.io/BarbBlock/blacklists/ublock-origin.txt', 'https://paulgb.github.io/BarbBlock/blacklists/ublock-origin.txt',
// Brave First Party & First Party CNAME // Brave First Party & First Party CNAME
'https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt', 'https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt'
'https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty-cname.txt',
'https://raw.githubusercontent.com/brave/adblock-lists/master/coin-miners.txt'
]); ]);
const PREDEFINED_WHITELIST = [ const PREDEFINED_WHITELIST = [

View File

@ -340,6 +340,8 @@ th.bing.com
images.ecomm.microsoft.com images.ecomm.microsoft.com
.yammerusercontent.com .yammerusercontent.com
.assets-yammer.com .assets-yammer.com
.edgestorewebcdn.azureedge.net
img-s-msn-com.akamaized.net
# Microsoft Cookie Consent # Microsoft Cookie Consent
wcpstatic.microsoft.com wcpstatic.microsoft.com
# Xbox # Xbox

View File

@ -519,6 +519,11 @@ www.kuguopush.com
.adcast.deviantart.com .adcast.deviantart.com
.inside.bitcomet.com .inside.bitcomet.com
.cdnads.com .cdnads.com
.stats.esomniture.com
.adalyser.com
.tradedoubler.com
.xiti.com
.cjt1.net
.youxiaoad.com .youxiaoad.com
.iteye.com .iteye.com
@ -1749,7 +1754,6 @@ ntp.msn.cn
assets.msn.cn assets.msn.cn
api.msn.com api.msn.com
browser.events.data.msn.com browser.events.data.msn.com
img-s-msn-com.akamaized.net
# >> OPPO # >> OPPO
adsfs.oppomobile.com adsfs.oppomobile.com
@ -1820,6 +1824,7 @@ adserve2.tom.com
.discovery.tom.com .discovery.tom.com
# brightdata (luminati) SDK # brightdata (luminati) SDK
.l-err.biz
.lum-sdk.io .lum-sdk.io
.luminatinet.com .luminatinet.com
.luminati.io .luminati.io
@ -1829,3 +1834,4 @@ adserve2.tom.com
.hola.org .hola.org
.h-vpn.org .h-vpn.org
.holashop.org .holashop.org
.svd-cdn.com

View File

@ -83,7 +83,7 @@ DOMAIN-SUFFIX,tw1.ru
# >> General # >> General
DOMAIN-KEYWORD,track.tiara DOMAIN-KEYWORD,track.tiara
DOMAIN-KEYWORD,adservice # DOMAIN-KEYWORD,adservice # conflict with @@://www.googleadservices.com^|
DOMAIN-KEYWORD,umeng DOMAIN-KEYWORD,umeng
DOMAIN-KEYWORD,adsby DOMAIN-KEYWORD,adsby
DOMAIN-KEYWORD,adsdk DOMAIN-KEYWORD,adsdk
@ -108,6 +108,8 @@ DOMAIN-KEYWORD,_vmind.qqvideo.tc.qq.com
DOMAIN-KEYWORD,-logging.nextmedia.com DOMAIN-KEYWORD,-logging.nextmedia.com
DOMAIN-KEYWORD,-spiky.clevertap-prod.com DOMAIN-KEYWORD,-spiky.clevertap-prod.com
DOMAIN-KEYWORD,.engage.3m. DOMAIN-KEYWORD,.engage.3m.
# -telemetry.officeapps.live.com.mcas.ms
# -telemetry.officeapps.live.com
DOMAIN-KEYWORD,telemetry.officeapps.live.com DOMAIN-KEYWORD,telemetry.officeapps.live.com
DOMAIN-KEYWORD,-launches.appsflyersdk.com DOMAIN-KEYWORD,-launches.appsflyersdk.com