Perf: make reject list build faster

This commit is contained in:
SukkaW 2023-07-07 19:39:50 +08:00
parent b659bff079
commit 4d0a5260ca
4 changed files with 80 additions and 177 deletions

View File

@ -2,12 +2,9 @@
const { promises: fsPromises } = require('fs'); const { promises: fsPromises } = require('fs');
const fse = require('fs-extra'); const fse = require('fs-extra');
const { resolve: pathResolve } = require('path'); const { resolve: pathResolve } = require('path');
const Piscina = require('piscina');
const { processHosts, processFilterRules, preprocessFullDomainSetBeforeUsedAsWorkerData } = require('./lib/parse-filter'); const { processHosts, processFilterRules, preprocessFullDomainSetBeforeUsedAsWorkerData } = require('./lib/parse-filter');
const cpuCount = require('os').cpus().length;
const { isCI } = require('ci-info');
const threads = isCI ? cpuCount : cpuCount / 2;
const { getDomain } = require('tldts'); const { getDomain } = require('tldts');
const Trie = require('./lib/trie');
const { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, PREDEFINED_ENFORCED_BACKLIST } = require('./lib/reject-data-source'); const { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, PREDEFINED_ENFORCED_BACKLIST } = require('./lib/reject-data-source');
const { withBannerArray } = require('./lib/with-banner'); const { withBannerArray } = require('./lib/with-banner');
@ -30,15 +27,14 @@ const domainSuffixSet = new Set();
console.time('* Download and process Hosts'); console.time('* Download and process Hosts');
// Parse from remote hosts & domain lists // Parse from remote hosts & domain lists
(await Promise.all( (await Promise.all(HOSTS.map(entry => processHosts(entry[0], entry[1]))))
HOSTS.map(entry => processHosts(entry[0], entry[1])) .forEach(hosts => {
)).forEach(hosts => { hosts.forEach(host => {
hosts.forEach(host => { if (host) {
if (host) { domainSets.add(host);
domainSets.add(host); }
} });
}); });
});
console.timeEnd('* Download and process Hosts'); console.timeEnd('* Download and process Hosts');
@ -167,8 +163,31 @@ const domainSuffixSet = new Set();
console.log(`Start deduping from black keywords/suffixes! (${previousSize})`); console.log(`Start deduping from black keywords/suffixes! (${previousSize})`);
console.time(`* Dedupe from black keywords/suffixes`); console.time(`* Dedupe from black keywords/suffixes`);
const trie1 = Trie.from(Array.from(domainSets));
domainSuffixSet.forEach(suffix => {
trie1.find(suffix, true).forEach(f => domainSets.delete(f));
});
filterRuleWhitelistDomainSets.forEach(suffix => {
trie1.find(suffix, true).forEach(f => domainSets.delete(f));
});
// Build whitelist trie, to handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`)
const trieWhite = Trie.from(Array.from(filterRuleWhitelistDomainSets));
for (const domain of domainSets) { for (const domain of domainSets) {
if (isMatchKeyword(domain) || isMatchSuffix(domain) || isInWhiteList(domain)) { if (domain[0] !== '.' && trieWhite.has(`.${domain}`)) {
domainSets.delete(domain);
continue;
}
if (domain[0] === '.') {
const found = trieWhite.find(domain);
if (found.length > 0) {
domainSets.delete(domain);
continue;
}
}
// Remove keyword
if (isMatchKeyword(domain)) {
domainSets.delete(domain); domainSets.delete(domain);
} }
} }
@ -183,44 +202,28 @@ const domainSuffixSet = new Set();
const START_TIME = Date.now(); const START_TIME = Date.now();
const domainSetsArray = Array.from(domainSets); const domainSetsArray = Array.from(domainSets);
const workerData = preprocessFullDomainSetBeforeUsedAsWorkerData(domainSetsArray); const trie2 = Trie.from(domainSetsArray);
const fullsetDomainStartsWithADot = preprocessFullDomainSetBeforeUsedAsWorkerData(domainSetsArray);
console.log(fullsetDomainStartsWithADot.length);
const piscina = new Piscina({ for (let j = 0, len = fullsetDomainStartsWithADot.length; j < len; j++) {
filename: pathResolve(__dirname, 'worker/build-reject-domainset-worker.js'), const domainStartsWithADotAndFromFullSet = fullsetDomainStartsWithADot[j];
workerData, const found = trie2.find(domainStartsWithADotAndFromFullSet, false);
idleTimeout: 50, if (found.length) {
minThreads: threads, found.forEach(f => {
maxThreads: threads domainSets.delete(f);
}); })
}
console.log(workerData.length); const a = domainStartsWithADotAndFromFullSet.slice(1);
if (trie2.has(a)) {
console.log(`Launching ${threads} threads...`); domainSets.delete(a);
}
const tasksArray = domainSetsArray.reduce((result, element, index) => { }
const chunk = index % threads;
result[chunk] ??= [];
result[chunk].push(element);
return result;
}, /** @type {string[][]} */([]));
(await Promise.all(
tasksArray.map(chunk => piscina.run({ chunk }))
)).forEach((result, taskIndex) => {
const chunk = tasksArray[taskIndex];
for (let i = 0, len = result.length; i < len; i++) {
if (result[i]) {
domainSets.delete(chunk[i]);
}
}
});
console.log(`* Dedupe from covered subdomain - ${(Date.now() - START_TIME) / 1000}s`); console.log(`* Dedupe from covered subdomain - ${(Date.now() - START_TIME) / 1000}s`);
console.log(`Deduped ${previousSize - domainSets.size} rules!`); console.log(`Deduped ${previousSize - domainSets.size} rules!`);
await piscina.destroy();
console.time('* Write reject.conf'); console.time('* Write reject.conf');
const sorter = (a, b) => { const sorter = (a, b) => {
@ -264,9 +267,6 @@ const domainSuffixSet = new Set();
console.timeEnd('* Write reject.conf'); console.timeEnd('* Write reject.conf');
console.timeEnd('Total Time - build-reject-domain-set'); console.timeEnd('Total Time - build-reject-domain-set');
if (piscina.queueSize === 0) {
process.exit(0);
}
})(); })();
/** /**
@ -281,34 +281,3 @@ function isMatchKeyword(domain) {
return false; return false;
} }
/**
 * Tell whether `domain` ends with at least one entry of `domainSuffixSet`.
 *
 * @param {string} domain - Domain to test.
 * @returns {boolean} `true` when some suffix matches, `false` when none does.
 */
function isMatchSuffix(domain) {
  return Array.from(domainSuffixSet).some((suffix) => domain.endsWith(suffix));
}
/**
 * Tell whether `domain` is affected by any entry of
 * `filterRuleWhitelistDomainSets`.
 *
 * A domain is reported as whitelisted in two situations:
 *  - it ends with a whitelist entry (being equal to the entry is a special
 *    case of this);
 *  - a whitelist entry ends with it. If a whole domain is in the blacklist but
 *    one of its subdomains is whitelisted, there is no choice but to remove the
 *    whole domain from the blacklist.
 *
 * @param {string} domain - Domain to test.
 * @returns {boolean} `true` when either situation holds for some entry.
 */
function isInWhiteList(domain) {
  return [...filterRuleWhitelistDomainSets].some(
    (entry) => domain.endsWith(entry) || entry.endsWith(domain)
  );
}

View File

@ -397,7 +397,7 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
*/ */
function preprocessFullDomainSetBeforeUsedAsWorkerData(data) {
  // Keep only the entries that start with a dot. `filter` returns a fresh
  // array, so the in-place `sort` below never reorders the caller's `data`.
  const domainsStartingWithADot = data.filter((domain) => domain[0] === '.');

  // Shortest entries first.
  return domainsStartingWithADot.sort((a, b) => a.length - b.length);
}

View File

@ -129,7 +129,7 @@ class Trie {
/** /**
* Method used to delete a prefix from the trie. * Method used to delete a prefix from the trie.
* *
* @param {string|array} suffix - Prefix to delete. * @param {string} suffix - Prefix to delete.
* @return {boolean} * @return {boolean}
*/ */
delete(suffix) { delete(suffix) {
@ -198,66 +198,45 @@ class Trie {
} }
/** /**
* Method returning an iterator over the trie's prefixes. * @return {string[]}
*
* @param {string|array} [prefix] - Optional starting prefix.
* @return {Iterator}
*/ */
// prefixes(prefix) { dump() {
// let node = this.root; let node = this.root;
// const nodeStack = []; const nodeStack = [];
// const prefixStack = []; const prefixStack = [];
// let token; // Resolving initial prefix
// let i; const prefix = '';
// let l;
// const isString = this.mode === 'string'; nodeStack.push(node);
prefixStack.push(prefix);
// // Resolving initial prefix /** @type {string[]} */
// if (prefix) { const results = [];
// for (i = 0, l = prefix.length; i < l; i++) {
// token = prefix[i];
// node = node[token];
// // If the prefix does not exist, we return an empty iterator let currentNode;
// if (typeof node === 'undefined') let currentPrefix;
// return Iterator.empty(); let hasValue = false;
// } let k;
// }
// else {
// prefix = isString ? '' : [];
// }
// nodeStack.push(node); while (nodeStack.length) {
// prefixStack.push(prefix); currentNode = nodeStack.pop();
currentPrefix = prefixStack.pop();
// return new Iterator(() => { for (k in currentNode) {
// let currentNode; if (k === SENTINEL) {
// let currentPrefix; hasValue = true;
// let hasValue = false; continue;
// let k; }
// while (nodeStack.length) { nodeStack.push(currentNode[k]);
// currentNode = nodeStack.pop(); prefixStack.push(k + currentPrefix);
// currentPrefix = prefixStack.pop(); }
// for (k in currentNode) { if (hasValue) results.push(currentPrefix);
// if (k === SENTINEL) { }
// hasValue = true;
// continue;
// }
// nodeStack.push(currentNode[k]); return results;
// prefixStack.push(isString ? currentPrefix + k : currentPrefix.concat(k)); }
// }
// if (hasValue)
// return { done: false, value: currentPrefix };
// }
// return { done: true };
// });
// }
/** /**
* Convenience known methods. * Convenience known methods.

View File

@ -1,45 +0,0 @@
// @ts-check
const Piscina = require('piscina');
const Trie = require('../lib/trie');
// const { isCI } = require('ci-info');
/** @type {string[]} */
const fullsetDomainStartsWithADot = Piscina.workerData;
const totalLen = fullsetDomainStartsWithADot.length;
const DOT = '.';
// const log = isCI ? () => { } : console.log.bind(console);
/**
* @param {{ chunk: string[] }} param0
*/
module.exports = ({ chunk }) => {
const chunkLength = chunk.length;
const outputToBeRemoved = new Int8Array(chunkLength);
const trie = Trie.from(chunk);
for (let j = 0; j < totalLen; j++) {
const domainStartsWithADotAndFromFullSet = fullsetDomainStartsWithADot[j];
const found = trie.find(domainStartsWithADotAndFromFullSet, false)
if (found.length) {
found.forEach(f => {
const index = chunk.indexOf(f);
if (index !== -1) {
outputToBeRemoved[index] = 1;
}
})
}
const a = domainStartsWithADotAndFromFullSet.slice(1);
if (trie.has(a)) {
const index = chunk.indexOf(a);
if (index !== -1) {
outputToBeRemoved[index] = 1;
}
}
}
return Piscina.move(outputToBeRemoved);
};