From 4d0a5260ca2b71c914538734d998a7387dbddaed Mon Sep 17 00:00:00 2001 From: SukkaW Date: Fri, 7 Jul 2023 19:39:50 +0800 Subject: [PATCH] Perf: make reject list build faster --- Build/build-reject-domainset.js | 127 +++++++----------- Build/lib/parse-filter.js | 2 +- Build/lib/trie.js | 83 +++++------- Build/worker/build-reject-domainset-worker.js | 45 ------- 4 files changed, 80 insertions(+), 177 deletions(-) delete mode 100644 Build/worker/build-reject-domainset-worker.js diff --git a/Build/build-reject-domainset.js b/Build/build-reject-domainset.js index 81413e35..3d23e58d 100644 --- a/Build/build-reject-domainset.js +++ b/Build/build-reject-domainset.js @@ -2,12 +2,9 @@ const { promises: fsPromises } = require('fs'); const fse = require('fs-extra'); const { resolve: pathResolve } = require('path'); -const Piscina = require('piscina'); const { processHosts, processFilterRules, preprocessFullDomainSetBeforeUsedAsWorkerData } = require('./lib/parse-filter'); -const cpuCount = require('os').cpus().length; -const { isCI } = require('ci-info'); -const threads = isCI ? cpuCount : cpuCount / 2; const { getDomain } = require('tldts'); +const Trie = require('./lib/trie'); const { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, PREDEFINED_ENFORCED_BACKLIST } = require('./lib/reject-data-source'); const { withBannerArray } = require('./lib/with-banner'); @@ -30,15 +27,14 @@ const domainSuffixSet = new Set(); console.time('* Download and process Hosts'); // Parse from remote hosts & domain lists - (await Promise.all( - HOSTS.map(entry => processHosts(entry[0], entry[1])) - )).forEach(hosts => { - hosts.forEach(host => { - if (host) { - domainSets.add(host); - } + (await Promise.all(HOSTS.map(entry => processHosts(entry[0], entry[1])))) + .forEach(hosts => { + hosts.forEach(host => { + if (host) { + domainSets.add(host); + } + }); }); - }); console.timeEnd('* Download and process Hosts'); @@ -167,8 +163,31 @@ const domainSuffixSet = new Set(); console.log(`Start deduping from black keywords/suffixes! (${previousSize})`); console.time(`* Dedupe from black keywords/suffixes`); + const trie1 = Trie.from(Array.from(domainSets)); + domainSuffixSet.forEach(suffix => { + trie1.find(suffix, true).forEach(f => domainSets.delete(f)); + }); + filterRuleWhitelistDomainSets.forEach(suffix => { + trie1.find(suffix, true).forEach(f => domainSets.delete(f)); + }); + + // Build whitelist trie, to handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`) + const trieWhite = Trie.from(Array.from(filterRuleWhitelistDomainSets)); for (const domain of domainSets) { - if (isMatchKeyword(domain) || isMatchSuffix(domain) || isInWhiteList(domain)) { + if (domain[0] !== '.' && trieWhite.has(`.${domain}`)) { + domainSets.delete(domain); + continue; + } + if (domain[0] === '.') { + const found = trieWhite.find(domain); + if (found.length > 0) { + domainSets.delete(domain); + continue; + } + } + + // Remove keyword + if (isMatchKeyword(domain)) { domainSets.delete(domain); } } @@ -183,44 +202,28 @@ const domainSuffixSet = new Set(); const START_TIME = Date.now(); const domainSetsArray = Array.from(domainSets); - const workerData = preprocessFullDomainSetBeforeUsedAsWorkerData(domainSetsArray); + const trie2 = Trie.from(domainSetsArray); + const fullsetDomainStartsWithADot = preprocessFullDomainSetBeforeUsedAsWorkerData(domainSetsArray); + console.log(fullsetDomainStartsWithADot.length); - const piscina = new Piscina({ - filename: pathResolve(__dirname, 'worker/build-reject-domainset-worker.js'), - workerData, - idleTimeout: 50, - minThreads: threads, - maxThreads: threads - }); + for (let j = 0, len = fullsetDomainStartsWithADot.length; j < len; j++) { + const domainStartsWithADotAndFromFullSet = fullsetDomainStartsWithADot[j]; + const found = trie2.find(domainStartsWithADotAndFromFullSet, false); + if (found.length) { + found.forEach(f => { + domainSets.delete(f); + }) + } - console.log(workerData.length); - - console.log(`Launching ${threads} threads...`); - - const tasksArray = domainSetsArray.reduce((result, element, index) => { - const chunk = index % threads; - result[chunk] ??= []; - - result[chunk].push(element); - return result; - }, /** @type {string[][]} */([])); - - (await Promise.all( - tasksArray.map(chunk => piscina.run({ chunk })) - )).forEach((result, taskIndex) => { - const chunk = tasksArray[taskIndex]; - for (let i = 0, len = result.length; i < len; i++) { - if (result[i]) { - domainSets.delete(chunk[i]); - } - } - }); + const a = domainStartsWithADotAndFromFullSet.slice(1); + if (trie2.has(a)) { + domainSets.delete(a); + } + } console.log(`* Dedupe from covered subdomain - ${(Date.now() - START_TIME) / 1000}s`); console.log(`Deduped ${previousSize - domainSets.size} rules!`); - await piscina.destroy(); - console.time('* Write reject.conf'); const sorter = (a, b) => { @@ -264,9 +267,6 @@ const domainSuffixSet = new Set(); console.timeEnd('* Write reject.conf'); console.timeEnd('Total Time - build-reject-domain-set'); - if (piscina.queueSize === 0) { - process.exit(0); - } })(); /** @@ -281,34 +281,3 @@ function isMatchKeyword(domain) { return false; } - -/** - * @param {string} domain - */ -function isMatchSuffix(domain) { - for (const suffix of domainSuffixSet) { - if (domain.endsWith(suffix)) { - return true; - } - } - - return false; -} - -/** - * @param {string} domain - */ -function isInWhiteList(domain) { - for (const white of filterRuleWhitelistDomainSets) { - if (domain === white || domain.endsWith(white)) { - return true; - } - if (white.endsWith(domain)) { - // If a whole domain is in blacklist but a subdomain is in whitelist - // We have no choice but to remove the whole domain from blacklist - return true; - } - } - - return false; -} diff --git a/Build/lib/parse-filter.js b/Build/lib/parse-filter.js index 4a7167ef..b0393265 100644 --- a/Build/lib/parse-filter.js +++ b/Build/lib/parse-filter.js @@ -397,7 +397,7 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart */ function preprocessFullDomainSetBeforeUsedAsWorkerData(data) { return data - .filter(domain => domain.charCodeAt(0) === 46) + .filter(domain => domain[0] === '.') .sort((a, b) => a.length - b.length); } diff --git a/Build/lib/trie.js b/Build/lib/trie.js index 6908c8a7..58fc1f4d 100644 --- a/Build/lib/trie.js +++ b/Build/lib/trie.js @@ -129,7 +129,7 @@ class Trie { /** * Method used to delete a prefix from the trie. * - * @param {string|array} suffix - Prefix to delete. + * @param {string} suffix - Prefix to delete. * @return {boolean} */ delete(suffix) { @@ -198,66 +198,45 @@ class Trie { } /** - * Method returning an iterator over the trie's prefixes. - * - * @param {string|array} [prefix] - Optional starting prefix. - * @return {Iterator} + * @return {string[]} */ - // prefixes(prefix) { - // let node = this.root; - // const nodeStack = []; - // const prefixStack = []; - // let token; - // let i; - // let l; + dump() { + let node = this.root; + const nodeStack = []; + const prefixStack = []; + // Resolving initial prefix + const prefix = ''; - // const isString = this.mode === 'string'; + nodeStack.push(node); + prefixStack.push(prefix); - // // Resolving initial prefix - // if (prefix) { - // for (i = 0, l = prefix.length; i < l; i++) { - // token = prefix[i]; - // node = node[token]; + /** @type {string[]} */ + const results = []; - // // If the prefix does not exist, we return an empty iterator - // if (typeof node === 'undefined') - // return Iterator.empty(); - // } - // } - // else { - // prefix = isString ? '' : []; - // } + let currentNode; + let currentPrefix; + let hasValue = false; + let k; - // nodeStack.push(node); - // prefixStack.push(prefix); + while (nodeStack.length) { + currentNode = nodeStack.pop(); + currentPrefix = prefixStack.pop(); - // return new Iterator(() => { - // let currentNode; - // let currentPrefix; - // let hasValue = false; - // let k; + for (k in currentNode) { + if (k === SENTINEL) { + hasValue = true; + continue; + } - // while (nodeStack.length) { - // currentNode = nodeStack.pop(); - // currentPrefix = prefixStack.pop(); + nodeStack.push(currentNode[k]); + prefixStack.push(k + currentPrefix); + } - // for (k in currentNode) { - // if (k === SENTINEL) { - // hasValue = true; - // continue; - // } + if (hasValue) results.push(currentPrefix); + } - // nodeStack.push(currentNode[k]); - // prefixStack.push(isString ? currentPrefix + k : currentPrefix.concat(k)); - // } - - // if (hasValue) - // return { done: false, value: currentPrefix }; - // } - - // return { done: true }; - // }); - // } + return results; + } /** * Convenience known methods. diff --git a/Build/worker/build-reject-domainset-worker.js b/Build/worker/build-reject-domainset-worker.js deleted file mode 100644 index a99260f4..00000000 --- a/Build/worker/build-reject-domainset-worker.js +++ /dev/null @@ -1,45 +0,0 @@ -// @ts-check -const Piscina = require('piscina'); -const Trie = require('../lib/trie'); -// const { isCI } = require('ci-info'); -/** @type {string[]} */ -const fullsetDomainStartsWithADot = Piscina.workerData; -const totalLen = fullsetDomainStartsWithADot.length; - -const DOT = '.'; - -// const log = isCI ? () => { } : console.log.bind(console); -/** - * @param {{ chunk: string[] }} param0 - */ -module.exports = ({ chunk }) => { - const chunkLength = chunk.length; - const outputToBeRemoved = new Int8Array(chunkLength); - - const trie = Trie.from(chunk); - - for (let j = 0; j < totalLen; j++) { - const domainStartsWithADotAndFromFullSet = fullsetDomainStartsWithADot[j]; - - const found = trie.find(domainStartsWithADotAndFromFullSet, false) - - if (found.length) { - found.forEach(f => { - const index = chunk.indexOf(f); - if (index !== -1) { - outputToBeRemoved[index] = 1; - } - }) - } - - const a = domainStartsWithADotAndFromFullSet.slice(1); - if (trie.has(a)) { - const index = chunk.indexOf(a); - if (index !== -1) { - outputToBeRemoved[index] = 1; - } - } - } - - return Piscina.move(outputToBeRemoved); -};