Make AdGuard parse faster / Make reject stats correct

This commit is contained in:
SukkaW 2023-09-18 01:03:31 +08:00
parent 21cddea6f0
commit cbaa4d51f5
3 changed files with 201 additions and 122 deletions

View File

@ -69,10 +69,11 @@ const buildPhishingDomainSet = task(__filename, async () => {
processFilterRules( processFilterRules(
'https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt', 'https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt',
[ [
'https://malware-filter.gitlab.io/phishing-filter/phishing-filter-agh.txt',
'https://malware-filter.pages.dev/phishing-filter-agh.txt',
'https://phishing-filter.pages.dev/phishing-filter-agh.txt' 'https://phishing-filter.pages.dev/phishing-filter-agh.txt'
] // Prefer mirror, since malware-filter.gitlab.io has not been updated for a while
// 'https://malware-filter.gitlab.io/malware-filter/phishing-filter-agh.txt'
],
false
), ),
getGorhillPublicSuffixPromise() getGorhillPublicSuffixPromise()
]); ]);

View File

@ -14,7 +14,7 @@ const { readFileByLine } = require('./lib/fetch-remote-text-by-line');
const { createDomainSorter } = require('./lib/stable-sort-domain'); const { createDomainSorter } = require('./lib/stable-sort-domain');
const { traceSync, task } = require('./lib/trace-runner'); const { traceSync, task } = require('./lib/trace-runner');
const { getGorhillPublicSuffixPromise } = require('./lib/get-gorhill-publicsuffix'); const { getGorhillPublicSuffixPromise } = require('./lib/get-gorhill-publicsuffix');
const { createCachedGorhillGetDomain } = require('./lib/cached-tld-parse'); const tldts = require('tldts');
/** Whitelists */ /** Whitelists */
const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST); const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST);
@ -131,8 +131,6 @@ const buildRejectDomainSet = task(__filename, async () => {
console.log(`Start deduping from black keywords/suffixes! (${previousSize})`); console.log(`Start deduping from black keywords/suffixes! (${previousSize})`);
console.time('* Dedupe from black keywords/suffixes'); console.time('* Dedupe from black keywords/suffixes');
const kwfilter = createKeywordFilter(domainKeywordsSet);
const trie1 = Trie.from(domainSets); const trie1 = Trie.from(domainSets);
domainSuffixSet.forEach(suffix => { domainSuffixSet.forEach(suffix => {
trie1.find(suffix, true).forEach(f => domainSets.delete(f)); trie1.find(suffix, true).forEach(f => domainSets.delete(f));
@ -141,6 +139,9 @@ const buildRejectDomainSet = task(__filename, async () => {
trie1.find(suffix, true).forEach(f => domainSets.delete(f)); trie1.find(suffix, true).forEach(f => domainSets.delete(f));
}); });
// remove pre-defined enforced blacklist from whitelist
const kwfilter = createKeywordFilter(domainKeywordsSet);
// Build whitelist trie, to handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`) // Build whitelist trie, to handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`)
const trieWhite = Trie.from(filterRuleWhitelistDomainSets); const trieWhite = Trie.from(filterRuleWhitelistDomainSets);
for (const domain of domainSets) { for (const domain of domainSets) {
@ -171,19 +172,18 @@ const buildRejectDomainSet = task(__filename, async () => {
console.log(`Deduped ${previousSize - dudupedDominArray.length} rules!`); console.log(`Deduped ${previousSize - dudupedDominArray.length} rules!`);
// Create reject stats // Create reject stats
const getDomain = createCachedGorhillGetDomain(gorhill);
/** @type {[string, number][]} */ /** @type {[string, number][]} */
const rejectDomainsStats = traceSync( const rejectDomainsStats = traceSync(
'* Collect reject domain stats', '* Collect reject domain stats',
() => Object.entries( () => Object.entries(
dudupedDominArray.reduce((acc, cur) => { dudupedDominArray.reduce((acc, cur) => {
const suffix = getDomain(cur); const suffix = tldts.getDomain(cur, { allowPrivateDomains: false });
if (suffix) { if (suffix) {
acc[suffix] = (acc[suffix] ?? 0) + 1; acc[suffix] = (acc[suffix] ?? 0) + 1;
} }
return acc; return acc;
}, {}) }, {})
).filter(a => a[1] > 2).sort((a, b) => { ).filter(a => a[1] > 10).sort((a, b) => {
const t = b[1] - a[1]; const t = b[1] - a[1];
if (t !== 0) { if (t !== 0) {
return t; return t;

View File

@ -5,6 +5,7 @@ const { fetchRemoteTextAndCreateReadlineInterface } = require('./fetch-remote-te
const { NetworkFilter } = require('@cliqz/adblocker'); const { NetworkFilter } = require('@cliqz/adblocker');
const { processLine } = require('./process-line'); const { processLine } = require('./process-line');
const { performance } = require('perf_hooks'); const { performance } = require('perf_hooks');
const { getGorhillPublicSuffixPromise } = require('./get-gorhill-publicsuffix');
const DEBUG_DOMAIN_TO_FIND = null; // example.com | null const DEBUG_DOMAIN_TO_FIND = null; // example.com | null
let foundDebugDomain = false; let foundDebugDomain = false;
@ -22,14 +23,12 @@ const warnOnce = (url, isWhite, ...message) => {
const normalizeDomain = (domain) => { const normalizeDomain = (domain) => {
if (!domain) return null; if (!domain) return null;
const { isIcann, isPrivate, hostname, isIp } = tldts.parse(domain); const parsed = tldts.parse(domain);
if (isIp) return null; if (parsed.isIp) return null;
if (isIcann || isPrivate) { if (parsed.isIcann || parsed.isPrivate) {
if (hostname?.[0] === '.') { const h = parsed.hostname;
return hostname.slice(1); return h?.[0] === '.' ? h.slice(1) : h;
}
return hostname;
} }
return null; return null;
@ -122,51 +121,51 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
/** @type Set<string> */ /** @type Set<string> */
const blacklistDomainSets = new Set(); const blacklistDomainSets = new Set();
const __addToBlackList = (domainToBeAddedToBlack, isSubDomain) => { /**
if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) { * @param {string} domainToBeAddedToBlack
warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND); * @param {boolean} isSubDomain
foundDebugDomain = true; */
} const addToBlackList = (domainToBeAddedToBlack, isSubDomain) => {
if (isSubDomain && domainToBeAddedToBlack[0] !== '.') { if (isSubDomain && domainToBeAddedToBlack[0] !== '.') {
blacklistDomainSets.add(`.${domainToBeAddedToBlack}`); blacklistDomainSets.add(`.${domainToBeAddedToBlack}`);
} else { } else {
blacklistDomainSets.add(domainToBeAddedToBlack); blacklistDomainSets.add(domainToBeAddedToBlack);
} }
}; };
const addToBlackList = DEBUG_DOMAIN_TO_FIND == null /**
? __addToBlackList * @param {string} domainToBeAddedToWhite
: (domainToBeAddedToBlack, isSubDomain) => { * @param {boolean} [isSubDomain]
if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) { */
warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND); const addToWhiteList = (domainToBeAddedToWhite, isSubDomain = true) => {
foundDebugDomain = true; if (isSubDomain && domainToBeAddedToWhite[0] !== '.') {
} blacklistDomainSets.add(`.${domainToBeAddedToWhite}`);
__addToBlackList(domainToBeAddedToBlack, isSubDomain); } else {
}; blacklistDomainSets.add(domainToBeAddedToWhite);
}
const __addToWhiteList = (domainToBeAddedToWhite) => {
whitelistDomainSets.add(domainToBeAddedToWhite);
}; };
const addToWhiteList = DEBUG_DOMAIN_TO_FIND == null
? __addToWhiteList
: (domainToBeAddedToWhite) => {
if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToWhite.includes(DEBUG_DOMAIN_TO_FIND)) {
warnOnce(filterRulesUrl.toString(), true, DEBUG_DOMAIN_TO_FIND);
foundDebugDomain = true;
}
__addToWhiteList(domainToBeAddedToWhite);
};
let downloadTime = 0; let downloadTime = 0;
const gorhill = await getGorhillPublicSuffixPromise();
const lineCb = (line) => { const lineCb = (line) => {
const result = parse(line, includeThirdParties); const result = parse(line, includeThirdParties, gorhill);
if (result) { if (result) {
const flag = result[1]; const flag = result[1];
const hostname = result[0]; const hostname = result[0];
if (DEBUG_DOMAIN_TO_FIND) {
if (hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
warnOnce(filterRulesUrl.toString(), flag === 0 || flag === -1, DEBUG_DOMAIN_TO_FIND);
foundDebugDomain = true;
}
}
switch (flag) { switch (flag) {
case 0: case 0:
addToWhiteList(hostname); addToWhiteList(hostname, true);
break;
case -1:
addToWhiteList(hostname, false);
break; break;
case 1: case 1:
addToBlackList(hostname, false); addToBlackList(hostname, false);
@ -183,7 +182,8 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
if (!fallbackUrls || fallbackUrls.length === 0) { if (!fallbackUrls || fallbackUrls.length === 0) {
const downloadStart = performance.now(); const downloadStart = performance.now();
for await (const line of await fetchRemoteTextAndCreateReadlineInterface(filterRulesUrl)) { for await (const line of await fetchRemoteTextAndCreateReadlineInterface(filterRulesUrl)) {
lineCb(line.trim()); // don't trim here
lineCb(line);
} }
downloadTime = performance.now() - downloadStart; downloadTime = performance.now() - downloadStart;
} else { } else {
@ -202,7 +202,7 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
return text; return text;
}) })
) )
).split('\n').map(line => line.trim()); ).split('\n');
} catch (e) { } catch (e) {
console.log(`Download Rule for [${filterRulesUrl}] failed`); console.log(`Download Rule for [${filterRulesUrl}] failed`);
throw e; throw e;
@ -210,7 +210,7 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
downloadTime = performance.now() - downloadStart; downloadTime = performance.now() - downloadStart;
for (let i = 0, len = filterRules.length; i < len; i++) { for (let i = 0, len = filterRules.length; i < len; i++) {
lineCb(filterRules[i].trim()); lineCb(filterRules[i]);
} }
} }
@ -230,35 +230,44 @@ const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)
/** /**
* @param {string} $line * @param {string} $line
* @param {boolean} includeThirdParties * @param {boolean} includeThirdParties
* @returns {null | [string, 0 | 1 | 2]} - 0 white, 1 black absolute, 2 black include subdomain * @param {import('gorhill-publicsuffixlist').default} gorhill
* @returns {null | [string, 0 | 1 | 2 | -1]} - 0 white include subdomain, 1 black absolute, 2 black include subdomain, -1 white absolute
*/ */
function parse($line, includeThirdParties) { function parse($line, includeThirdParties, gorhill) {
if (
// doesn't include
!$line.includes('.') // rule with out dot can not be a domain
// includes
|| $line.includes('!')
|| $line.includes('?')
|| $line.includes('*')
|| $line.includes('[')
|| $line.includes('(')
|| $line.includes(']')
|| $line.includes(')')
|| $line.includes(',')
|| R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test($line)
) {
return null;
}
const line = $line.trim(); const line = $line.trim();
const len = line.length;
if (len === 0) {
return null;
}
const firstChar = line[0];
const lastChar = line[len - 1];
if ( if (
line === '' len === 0
|| line[0] === '/' || firstChar === '/'
|| R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test(line)
// doesn't include
|| !line.includes('.') // rule with out dot can not be a domain
// includes
// || line.includes('#')
|| line.includes('!')
|| line.includes('?')
|| line.includes('*')
// || line.includes('=')
|| line.includes('[')
|| line.includes('(')
|| line.includes(']')
|| line.includes(')')
|| line.includes(',')
// || line.includes('~')
// || line.includes('&')
// || line.includes('%')
// ends with // ends with
|| line.endsWith('.') || lastChar === '.' // || line.endsWith('.')
|| line.endsWith('-') || lastChar === '-' // || line.endsWith('-')
|| line.endsWith('_') || lastChar === '_' // || line.endsWith('_')
// special modifier // special modifier
|| R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line) || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
|| ((line.includes('/') || line.includes(':')) && !line.includes('://')) || ((line.includes('/') || line.includes(':')) && !line.includes('://'))
@ -286,52 +295,82 @@ function parse($line, includeThirdParties) {
} }
if ( if (
filter.hasHostname() // must have filter.hostname // filter.hasHostname() // must have
&& filter.isPlain() && filter.isPlain()
&& (!filter.isRegex()) && (!filter.isRegex())
&& (!filter.isFullRegex()) && (!filter.isFullRegex())
) { ) {
const hostname = normalizeDomain(filter.getHostname()); if (!gorhill.getDomain(filter.hostname)) {
if (hostname) { return null;
if (filter.isException() || filter.isBadFilter()) { }
return [hostname, 0]; const hostname = normalizeDomain(filter.hostname);
} if (!hostname) {
if (filter.firstParty() === filter.thirdParty()) { return null;
}
if (filter.isException() || filter.isBadFilter()) {
return [hostname, 0];
}
const _1p = filter.firstParty();
const _3p = filter.thirdParty();
if (_1p === _3p) {
return [hostname, 2];
}
if (_3p) {
if (includeThirdParties) {
return [hostname, 2]; return [hostname, 2];
} }
if (filter.thirdParty()) { return null;
if (includeThirdParties) { }
return [hostname, 2]; if (_1p) {
}
return null;
}
if (filter.firstParty()) {
return null;
}
} else {
return null; return null;
} }
} }
} }
/**
* abnormal filter that can not be parsed by NetworkFilter
*/
if (line.includes('$third-party') || line.includes('$frame')) { if (line.includes('$third-party') || line.includes('$frame')) {
/*
* `.bbelements.com^$third-party`
* `://o0e.ru^$third-party`
*/
return null; return null;
} }
const lineEndsWithCaret = line.endsWith('^'); const lineEndsWithCaretOrCaretVerticalBar = (
const lineEndsWithCaretVerticalBar = line.endsWith('^|'); lastChar === '^'
|| (lastChar === '|' && line[len - 2] === '^')
);
if (line[0] === '@' && line[1] === '@') { // whitelist (exception)
if (firstChar === '@' && line[1] === '@') {
/**
* cname exceptional filter can not be parsed by NetworkFilter
*
* `@@||m.faz.net^$cname`
*
* Surge / Clash can't handle CNAME either, so we just ignore them
*/
if (line.endsWith('$cname')) { if (line.endsWith('$cname')) {
return null; return null;
} }
/**
* Some "malformed" regex-based filters cannot be parsed by NetworkFilter
* `$genericblock` is also not supported by NetworkFilter
*
* `@@||cmechina.net^$genericblock`
* `@@|ftp.bmp.ovh^|`
* `@@|adsterra.com^|`
*/
if ( if (
// (line.startsWith('@@|') || line.startsWith('@@.')) // (line.startsWith('@@|') || line.startsWith('@@.'))
(line[2] === '|' || line[2] === '.') (line[2] === '|' || line[2] === '.')
&& ( && (
lineEndsWithCaret lineEndsWithCaretOrCaretVerticalBar
|| lineEndsWithCaretVerticalBar
|| line.endsWith('$genericblock') || line.endsWith('$genericblock')
|| line.endsWith('$document') || line.endsWith('$document')
) )
@ -352,22 +391,29 @@ function parse($line, includeThirdParties) {
if (domain) { if (domain) {
return [domain, 0]; return [domain, 0];
} }
console.warn(' * [parse-filter E0001] (black) invalid domain:', _domain);
console.warn(' * [parse-filter E0001] (black) invalid domain:', _domain);
return null; return null;
} }
} }
if ( if (
line.startsWith('||') firstChar === '|' && line[1] === '|'
&& ( && (
lineEndsWithCaret lineEndsWithCaretOrCaretVerticalBar
|| lineEndsWithCaretVerticalBar
|| line.endsWith('$cname') || line.endsWith('$cname')
) )
) { ) {
/**
* Some malformed filters can not be parsed by NetworkFilter:
*
* `||smetrics.teambeachbody.com^.com^`
* `||solutions.|pages.indigovision.com^`
* `||vystar..0rg@client.iebetanialaargentina.edu.co^`
*/
const _domain = line const _domain = line
.replace('||', '') // .replace('||', '')
.slice(2) // we already make sure line startsWith ||
.replace('^|', '') .replace('^|', '')
.replace('$cname', '') .replace('$cname', '')
.replaceAll('^', '') .replaceAll('^', '')
@ -382,20 +428,28 @@ function parse($line, includeThirdParties) {
return null; return null;
} }
const lineStartsWithSingleDot = line[0] === '.'; const lineStartsWithSingleDot = firstChar === '.';
if ( if (
lineStartsWithSingleDot lineStartsWithSingleDot
&& ( && lineEndsWithCaretOrCaretVerticalBar
lineEndsWithCaret
|| lineEndsWithCaretVerticalBar
)
) { ) {
/**
* `.ay.delivery^`
* `.m.bookben.com^`
* `.wap.x4399.com^`
*/
const _domain = line const _domain = line
.slice(1) // remove prefix dot
.replace('^|', '') .replace('^|', '')
.replaceAll('^', '') .replaceAll('^', '')
.slice(1)
.trim(); .trim();
const suffix = gorhill.getPublicSuffix(_domain);
if (!gorhill.suffixInPSL(suffix)) {
// This excludes domain-like resources such as `1.1.4.514.js`
return null;
}
const domain = normalizeDomain(_domain); const domain = normalizeDomain(_domain);
if (domain) { if (domain) {
return [domain, 2]; return [domain, 2];
@ -404,6 +458,12 @@ function parse($line, includeThirdParties) {
return null; return null;
} }
/**
* `|http://x.o2.pl^`
* `://mine.torrent.pw^`
* `://say.ac^`
*/
if ( if (
( (
line.startsWith('://') line.startsWith('://')
@ -412,10 +472,7 @@ function parse($line, includeThirdParties) {
|| line.startsWith('|http://') || line.startsWith('|http://')
|| line.startsWith('|https://') || line.startsWith('|https://')
) )
&& ( && lineEndsWithCaretOrCaretVerticalBar
lineEndsWithCaret
|| lineEndsWithCaretVerticalBar
)
) { ) {
const _domain = line const _domain = line
.replace('|https://', '') .replace('|https://', '')
@ -431,33 +488,54 @@ function parse($line, includeThirdParties) {
if (domain) { if (domain) {
return [domain, 1]; return [domain, 1];
} }
console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
return null; return null;
} }
if (line[0] !== '|' && lineEndsWithCaret) {
/**
* `_vmind.qqvideo.tc.qq.com^`
* `arketing.indianadunes.com^`
* `charlestownwyllie.oaklawnnonantum.com^`
* `-telemetry.officeapps.live.com^`
* `-tracker.biliapi.net`
* `_social_tracking.js^`
*/
if (firstChar !== '|' && lastChar === '^') {
const _domain = line.slice(0, -1); const _domain = line.slice(0, -1);
const domain = normalizeDomain(_domain); const domain = normalizeDomain(_domain);
if (domain) { if (domain) {
return [domain, 1]; return [domain, 1];
} }
console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
return null; return null;
} }
/**
* `.3.n.2.2.l30.js`
* `_prebid.js`
* `t.yesware.com`
* `ubmcmm.baidustatic.com`
* `portal.librus.pl$$advertisement-module`
* `@@-ds.metric.gstatic.com^|`
* `://gom.ge/cookie.js`
* `://accout-update-smba.jp.$document`
* `@@://googleadservices.com^|`
*/
const tryNormalizeDomain = normalizeDomain(line); const tryNormalizeDomain = normalizeDomain(line);
if ( if (tryNormalizeDomain) {
tryNormalizeDomain if (tryNormalizeDomain === line) {
&& ( // the entire rule is a domain
lineStartsWithSingleDot return [line, 2];
? tryNormalizeDomain.length === line.length - 1 }
: tryNormalizeDomain === line if (lineStartsWithSingleDot && tryNormalizeDomain === line.slice(1)) {
) // the leading dot of a dot-prefixed line has been stripped
) { return [line, 2];
return [line, 2]; }
} }
if (!line.endsWith('.js')) { if (!line.endsWith('.js') && !line.endsWith('.css')) {
console.warn(' * [parse-filter E0010] can not parse:', line); console.warn(' * [parse-filter E0010] can not parse:', line);
} }