Perf: make reject parsing faster & more robust

This commit is contained in:
SukkaW 2023-12-17 23:48:43 +08:00
parent e2f14d93b8
commit 91df00f7f3

View File

@ -5,7 +5,7 @@ import { processLine } from './process-line';
import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix'; import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
import type { PublicSuffixList } from '@gorhill/publicsuffixlist'; import type { PublicSuffixList } from '@gorhill/publicsuffixlist';
import { traceAsync } from './trace-runner'; import { traceAsync, traceSync } from './trace-runner';
import picocolors from 'picocolors'; import picocolors from 'picocolors';
import { normalizeDomain } from './normalize-domain'; import { normalizeDomain } from './normalize-domain';
import { fetchAssets } from './fetch-assets'; import { fetchAssets } from './fetch-assets';
@ -156,9 +156,13 @@ export async function processFilterRules(
() => fetchAssets(filterRulesUrl, fallbackUrls), () => fetchAssets(filterRulesUrl, fallbackUrls),
picocolors.gray picocolors.gray
)).split('\n'); )).split('\n');
const key = picocolors.gray(`- parse adguard filter ${filterRulesUrl}`);
console.time(key);
for (let i = 0, len = filterRules.length; i < len; i++) { for (let i = 0, len = filterRules.length; i < len; i++) {
lineCb(filterRules[i]); lineCb(filterRules[i]);
} }
console.timeEnd(key);
} }
}); });
@ -305,17 +309,19 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
} }
/** @example line.endsWith('^') */ /** @example line.endsWith('^') */
const linedEndsWithCaret = lastCharCode === 94; // lastChar === '^'; const lineEndsWithCaret = lastCharCode === 94; // lastChar === '^';
/** @example line.endsWith('^|') */ /** @example line.endsWith('^|') */
const lineEndsWithCaretVerticalBar = (lastCharCode === 124 /** lastChar === '|' */) && line[len - 2] === '^'; const lineEndsWithCaretVerticalBar = (lastCharCode === 124 /** lastChar === '|' */) && line[len - 2] === '^';
/** @example line.endsWith('^') || line.endsWith('^|') */ /** @example line.endsWith('^') || line.endsWith('^|') */
const lineEndsWithCaretOrCaretVerticalBar = linedEndsWithCaret || lineEndsWithCaretVerticalBar; const lineEndsWithCaretOrCaretVerticalBar = lineEndsWithCaret || lineEndsWithCaretVerticalBar;
// whitelist (exception) // whitelist (exception)
if ( if (
firstCharCode === 64 // 64 `@` firstCharCode === 64 // 64 `@`
&& line[1] === '@' && line[1] === '@'
) { ) {
let whiteIncludeAllSubDomain = true;
/** /**
* Some "malformed" regex-based filters can not be parsed by NetworkFilter * Some "malformed" regex-based filters can not be parsed by NetworkFilter
* "$genericblock`" is also not supported by NetworkFilter, see: * "$genericblock`" is also not supported by NetworkFilter, see:
@ -331,22 +337,27 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
let sliceStart = 0; let sliceStart = 0;
let sliceEnd: number | undefined; let sliceEnd: number | undefined;
// line.startsWith('@@|') || line.startsWith('@@.') if (line[2] === '|') { // line.startsWith('@@|')
if (line[2] === '|' || line[2] === '.') {
sliceStart = 3; sliceStart = 3;
// line.startsWith('@@||') whiteIncludeAllSubDomain = false;
if (line[3] === '|') {
sliceStart = 4;
}
}
/** if (line[3] === '|') { // line.startsWith('@@||')
* line.startsWith('@@://') sliceStart = 4;
* whiteIncludeAllSubDomain = true;
* `@@://googleadservices.com^|` }
* `@@://www.googleadservices.com^|` } else if (line[2] === '.') { // line.startsWith('@@.')
*/ sliceStart = 3;
if (line[2] === ':' && line[3] === '/' && line[4] === '/') { whiteIncludeAllSubDomain = true;
} else if (
/**
* line.startsWith('@@://')
*
* `@@://googleadservices.com^|`
* `@@://www.googleadservices.com^|`
*/
line[2] === ':' && line[3] === '/' && line[4] === '/'
) {
whiteIncludeAllSubDomain = false;
sliceStart = 5; sliceStart = 5;
} }
@ -368,7 +379,7 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
const sliced = line.slice(sliceStart, sliceEnd); const sliced = line.slice(sliceStart, sliceEnd);
const domain = normalizeDomain(sliced); const domain = normalizeDomain(sliced);
if (domain) { if (domain) {
return [domain, ParseType.WhiteIncludeSubdomain]; return [domain, whiteIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute];
} }
return [ return [
`[parse-filter E0001] (white) invalid domain: ${JSON.stringify({ `[parse-filter E0001] (white) invalid domain: ${JSON.stringify({
@ -386,40 +397,39 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
]; ];
} }
if (firstCharCode === 124) { // 124 `|` if (
if (lineEndsWithCaretOrCaretVerticalBar) { // 124 `|`
/** // line.startsWith('|')
* Some malformed filters can not be parsed by NetworkFilter: firstCharCode === 124
* && lineEndsWithCaretOrCaretVerticalBar
* `||smetrics.teambeachbody.com^.com^` ) {
* `||solutions.|pages.indigovision.com^` /**
* `||vystar..0rg@client.iebetanialaargentina.edu.co^` * Some malformed filters can not be parsed by NetworkFilter:
* `app-uat.latrobehealth.com.au^predirect.snapdeal.com` *
*/ * `||smetrics.teambeachbody.com^.com^`
* `||solutions.|pages.indigovision.com^`
* `||vystar..0rg@client.iebetanialaargentina.edu.co^`
* `app-uat.latrobehealth.com.au^predirect.snapdeal.com`
*/
const includeAllSubDomain = line[1] === '|'; const includeAllSubDomain = line[1] === '|';
const sliceStart = includeAllSubDomain ? 2 : 1; const sliceStart = includeAllSubDomain ? 2 : 1;
const sliceEnd = lastCharCode === 94 // lastChar === '^' const sliceEnd = lineEndsWithCaret
? -1 ? -1
: (lineEndsWithCaretVerticalBar : (lineEndsWithCaretVerticalBar ? -2 : undefined);
? -2
: undefined);
const _domain = line const sliced = line.slice(sliceStart, sliceEnd); // we already make sure line startsWith "|"
.slice(sliceStart, sliceEnd) // we already make sure line startsWith "|"
.trim();
const domain = normalizeDomain(_domain); const domain = normalizeDomain(sliced);
if (domain) { if (domain) {
return [domain, includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute]; return [domain, includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute];
}
return [
`[parse-filter E0002] (black) invalid domain: ${_domain}`,
ParseType.ErrorMessage
];
} }
return [
`[parse-filter E0002] (black) invalid domain: ${sliced}`,
ParseType.ErrorMessage
];
} }
const lineStartsWithSingleDot = firstCharCode === 46; // 46 `.` const lineStartsWithSingleDot = firstCharCode === 46; // 46 `.`
@ -432,75 +442,78 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
* `.m.bookben.com^` * `.m.bookben.com^`
* `.wap.x4399.com^` * `.wap.x4399.com^`
*/ */
const _domain = line.slice( const sliced = line.slice(
1, // remove prefix dot 1, // remove prefix dot
linedEndsWithCaret // replaceAll('^', '') lineEndsWithCaret // replaceAll('^', '')
? -1 ? -1
: (lineEndsWithCaretVerticalBar ? -2 : 0) // replace('^|', '') : (lineEndsWithCaretVerticalBar ? -2 : undefined) // replace('^|', '')
); );
const suffix = gorhill.getPublicSuffix(_domain); const suffix = gorhill.getPublicSuffix(sliced);
if (!gorhill.suffixInPSL(suffix)) { if (!gorhill.suffixInPSL(suffix)) {
// This exclude domain-like resource like `1.1.4.514.js` // This exclude domain-like resource like `1.1.4.514.js`
return null; return null;
} }
const domain = normalizeDomain(_domain); const domain = normalizeDomain(sliced);
if (domain) { if (domain) {
return [domain, ParseType.BlackIncludeSubdomain]; return [domain, ParseType.BlackIncludeSubdomain];
} }
return [ return [
`[paparse-filter E0003] (black) invalid domain: ${_domain}`, `[paparse-filter E0003] (black) invalid domain: ${sliced}`,
ParseType.ErrorMessage ParseType.ErrorMessage
]; ];
} }
/** /**
* `|http://x.o2.pl^` * `|http://x.o2.pl^`
* `://mine.torrent.pw^` * `://mine.torrent.pw^`
* `://say.ac^` * `://say.ac^`
*/ */
if ( if (lineEndsWithCaretOrCaretVerticalBar) {
( let sliceStart = 0;
line.startsWith('://') let sliceEnd;
|| line.startsWith('http://') if (lineEndsWithCaret) { // line.endsWith('^')
|| line.startsWith('https://') sliceEnd = -1;
|| line.startsWith('|http://') } else if (lineEndsWithCaretVerticalBar) { // line.endsWith('^|')
|| line.startsWith('|https://') sliceEnd = -2;
) }
&& lineEndsWithCaretOrCaretVerticalBar if (line.startsWith('://')) {
) { sliceStart = 3;
const _domain = line } else if (line.startsWith('http://')) {
.replace('|https://', '') sliceStart = 7;
.replace('https://', '') } else if (line.startsWith('https://')) {
.replace('|http://', '') sliceStart = 8;
.replace('http://', '') } else if (line.startsWith('|http://')) {
.replace('://', '') sliceStart = 8;
.replace('^|', '') } else if (line.startsWith('|https://')) {
.replaceAll('^', '') sliceStart = 9;
.trim();
const domain = normalizeDomain(_domain);
if (domain) {
return [domain, ParseType.BlackAbsolute];
} }
return [ if (sliceStart !== 0 || sliceEnd !== undefined) {
`[parse-filter E0004] (black) invalid domain: ${_domain}`, const sliced = line.slice(sliceStart, sliceEnd);
ParseType.ErrorMessage const domain = normalizeDomain(sliced);
]; if (domain) {
return [domain, ParseType.BlackIncludeSubdomain];
}
return [
`[parse-filter E0004] (black) invalid domain: ${JSON.stringify({
line, sliced, sliceStart, sliceEnd
})}`,
ParseType.ErrorMessage
];
}
} }
/** /**
* `_vmind.qqvideo.tc.qq.com^` * `_vmind.qqvideo.tc.qq.com^`
* `arketing.indianadunes.com^` * `arketing.indianadunes.com^`
* `charlestownwyllie.oaklawnnonantum.com^` * `charlestownwyllie.oaklawnnonantum.com^`
* `-telemetry.officeapps.live.com^` * `-telemetry.officeapps.live.com^`
* `-tracker.biliapi.net` * `-tracker.biliapi.net`
* `-logging.nextmedia.com` * `-logging.nextmedia.com`
* `_social_tracking.js^` * `_social_tracking.js^`
*/ */
if ( if (
firstCharCode !== 124 // 124 `|` firstCharCode !== 124 // 124 `|`
&& lastCharCode === 94 // 94 `^` && lastCharCode === 94 // 94 `^`
@ -524,43 +537,62 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
]; ];
} }
// Possibly that entire rule is domain
/**
* lineStartsWithSingleDot:
*
* `.cookielaw.js`
* `.content_tracking.js`
* `.ads.css`
*
* else:
*
* `_prebid.js`
* `t.yesware.com`
* `ubmcmm.baidustatic.com`
* `://www.smfg-card.$document`
* `portal.librus.pl$$advertisement-module`
* `@@-ds.metric.gstatic.com^|`
* `://gom.ge/cookie.js`
* `://accout-update-smba.jp.$document`
* `_200x250.png`
* `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
*/
let sliceStart = 0;
let sliceEnd: number | undefined;
if (lineStartsWithSingleDot) { if (lineStartsWithSingleDot) {
/** sliceStart = 1;
* `.cookielaw.js` }
* `.content_tracking.js` if (line.endsWith('^$all')) { // This salvage line `thepiratebay3.com^$all`
* `.ads.css` sliceEnd = -5;
*/ } else if (
const _domain = line.slice(1); // Try to salvage line like `://account.smba.$document`
// For this specific line, it will fail anyway though.
line.endsWith('$document')
) {
sliceEnd = -9;
}
const sliced = (sliceStart !== 0 || sliceEnd !== undefined) ? line.slice(sliceStart, sliceEnd) : line;
const suffix = gorhill.getPublicSuffix(sliced);
/**
* Fast exclude definitely not domain-like resource
*
* `.gatracking.js`, suffix is `js`,
* `.ads.css`, suffix is `css`,
* `-cpm-ads.$badfilter`, suffix is `$badfilter`,
* `portal.librus.pl$$advertisement-module`, suffix is `pl$$advertisement-module`
*/
if (!suffix || !gorhill.suffixInPSL(suffix)) {
// This exclude domain-like resource like `.gatracking.js`, `.beacon.min.js` and `.cookielaw.js`
console.log({ line, suffix });
return null;
}
const suffix = gorhill.getPublicSuffix(_domain); const tryNormalizeDomain = normalizeDomain(sliced);
if (!suffix || !gorhill.suffixInPSL(suffix)) { if (tryNormalizeDomain === sliced) {
// This exclude domain-like resource like `.gatracking.js`, `.beacon.min.js` and `.cookielaw.js` // the entire rule is domain
return null; return [sliced, ParseType.BlackIncludeSubdomain];
}
const tryNormalizeDomain = normalizeDomain(_domain);
if (tryNormalizeDomain === _domain) {
// the entire rule is domain
return [line, ParseType.BlackIncludeSubdomain];
}
} else {
/**
* `_prebid.js`
* `t.yesware.com`
* `ubmcmm.baidustatic.com`
* `://www.smfg-card.$document`
* `portal.librus.pl$$advertisement-module`
* `@@-ds.metric.gstatic.com^|`
* `://gom.ge/cookie.js`
* `://accout-update-smba.jp.$document`
* `_200x250.png`
* `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
*/
const tryNormalizeDomain = normalizeDomain(line);
if (tryNormalizeDomain === line) {
// the entire rule is domain
return [line, ParseType.BlackIncludeSubdomain];
}
} }
return [ return [