From b659bff079c3e66cd695f1202803557c0da1789b Mon Sep 17 00:00:00 2001 From: SukkaW Date: Fri, 7 Jul 2023 15:07:45 +0800 Subject: [PATCH] Perf: fastest dedupe --- Build/build-reject-domainset.js | 6 ++- Build/worker/build-reject-domainset-worker.js | 46 ++++++++----------- 2 files changed, 22 insertions(+), 30 deletions(-) diff --git a/Build/build-reject-domainset.js b/Build/build-reject-domainset.js index 5a6e971b..81413e35 100644 --- a/Build/build-reject-domainset.js +++ b/Build/build-reject-domainset.js @@ -183,15 +183,17 @@ const domainSuffixSet = new Set(); const START_TIME = Date.now(); const domainSetsArray = Array.from(domainSets); + const workerData = preprocessFullDomainSetBeforeUsedAsWorkerData(domainSetsArray); + const piscina = new Piscina({ filename: pathResolve(__dirname, 'worker/build-reject-domainset-worker.js'), - workerData: preprocessFullDomainSetBeforeUsedAsWorkerData(Array.from(domainSetsArray)), + workerData, idleTimeout: 50, minThreads: threads, maxThreads: threads }); - console.log(preprocessFullDomainSetBeforeUsedAsWorkerData(Array.from(domainSetsArray)).length); + console.log(workerData.length); console.log(`Launching ${threads} threads...`); diff --git a/Build/worker/build-reject-domainset-worker.js b/Build/worker/build-reject-domainset-worker.js index ef68ee29..a99260f4 100644 --- a/Build/worker/build-reject-domainset-worker.js +++ b/Build/worker/build-reject-domainset-worker.js @@ -1,8 +1,9 @@ // @ts-check const Piscina = require('piscina'); +const Trie = require('../lib/trie'); // const { isCI } = require('ci-info'); /** @type {string[]} */ -const fullsetDomainStartsWithADot = Piscina.workerData +const fullsetDomainStartsWithADot = Piscina.workerData; const totalLen = fullsetDomainStartsWithADot.length; const DOT = '.'; @@ -15,38 +16,27 @@ module.exports = ({ chunk }) => { const chunkLength = chunk.length; const outputToBeRemoved = new Int8Array(chunkLength); - for (let i = 0; i < chunkLength; i++) { - const domainFromInputChunk = chunk[i]; - const domainFromInputLen = domainFromInputChunk.length; + const trie = Trie.from(chunk); - for (let j = 0; j < totalLen; j++) { - const domainStartsWithADotAndFromFullSet = fullsetDomainStartsWithADot[j]; - // domainFromFullSet is always startsWith "." - if (domainStartsWithADotAndFromFullSet === domainFromInputChunk) continue; + for (let j = 0; j < totalLen; j++) { + const domainStartsWithADotAndFromFullSet = fullsetDomainStartsWithADot[j]; - const domainFromFullSetLen = domainStartsWithADotAndFromFullSet.length; + const found = trie.find(domainStartsWithADotAndFromFullSet, false) - if (domainFromInputLen < domainFromFullSetLen) { - if (domainFromInputLen + 1 !== domainFromFullSetLen) { - continue; + if (found.length) { + found.forEach(f => { + const index = chunk.indexOf(f); + if (index !== -1) { + outputToBeRemoved[index] = 1; } + }) + } - // !domainFromInput.starsWith('.') && `.${domainFromInput}` === domainFromFullSet - if ( - domainFromInputChunk[0] !== DOT - && domainStartsWithADotAndFromFullSet.endsWith(domainFromInputChunk) - ) { - outputToBeRemoved[i] = 1; - // log(domainFromInputChunk, domainStartsWithADotAndFromFullSet) - break; - } - } else if ( - domainFromInputLen > domainFromFullSetLen - && domainFromInputChunk.endsWith(domainStartsWithADotAndFromFullSet) - ) { - outputToBeRemoved[i] = 1; - // log(domainFromInputChunk, domainStartsWithADotAndFromFullSet) - break; + const a = domainStartsWithADotAndFromFullSet.slice(1); + if (trie.has(a)) { + const index = chunk.indexOf(a); + if (index !== -1) { + outputToBeRemoved[index] = 1; } } }