Perf: fastest dedupe

This commit is contained in:
SukkaW 2023-07-07 15:07:45 +08:00
parent 9dd9e4aa05
commit b659bff079
2 changed files with 22 additions and 30 deletions

View File

@@ -183,15 +183,17 @@ const domainSuffixSet = new Set();
const START_TIME = Date.now(); const START_TIME = Date.now();
const domainSetsArray = Array.from(domainSets); const domainSetsArray = Array.from(domainSets);
const workerData = preprocessFullDomainSetBeforeUsedAsWorkerData(domainSetsArray);
const piscina = new Piscina({ const piscina = new Piscina({
filename: pathResolve(__dirname, 'worker/build-reject-domainset-worker.js'), filename: pathResolve(__dirname, 'worker/build-reject-domainset-worker.js'),
workerData: preprocessFullDomainSetBeforeUsedAsWorkerData(Array.from(domainSetsArray)), workerData,
idleTimeout: 50, idleTimeout: 50,
minThreads: threads, minThreads: threads,
maxThreads: threads maxThreads: threads
}); });
console.log(preprocessFullDomainSetBeforeUsedAsWorkerData(Array.from(domainSetsArray)).length); console.log(workerData.length);
console.log(`Launching ${threads} threads...`); console.log(`Launching ${threads} threads...`);

View File

@@ -1,8 +1,9 @@
// @ts-check // @ts-check
const Piscina = require('piscina'); const Piscina = require('piscina');
const Trie = require('../lib/trie');
// const { isCI } = require('ci-info'); // const { isCI } = require('ci-info');
/** @type {string[]} */ /** @type {string[]} */
const fullsetDomainStartsWithADot = Piscina.workerData const fullsetDomainStartsWithADot = Piscina.workerData;
const totalLen = fullsetDomainStartsWithADot.length; const totalLen = fullsetDomainStartsWithADot.length;
const DOT = '.'; const DOT = '.';
@@ -15,38 +16,27 @@ module.exports = ({ chunk }) => {
const chunkLength = chunk.length; const chunkLength = chunk.length;
const outputToBeRemoved = new Int8Array(chunkLength); const outputToBeRemoved = new Int8Array(chunkLength);
for (let i = 0; i < chunkLength; i++) { const trie = Trie.from(chunk);
const domainFromInputChunk = chunk[i];
const domainFromInputLen = domainFromInputChunk.length;
for (let j = 0; j < totalLen; j++) { for (let j = 0; j < totalLen; j++) {
const domainStartsWithADotAndFromFullSet = fullsetDomainStartsWithADot[j]; const domainStartsWithADotAndFromFullSet = fullsetDomainStartsWithADot[j];
// domainFromFullSet is always startsWith "."
if (domainStartsWithADotAndFromFullSet === domainFromInputChunk) continue;
const domainFromFullSetLen = domainStartsWithADotAndFromFullSet.length; const found = trie.find(domainStartsWithADotAndFromFullSet, false)
if (domainFromInputLen < domainFromFullSetLen) { if (found.length) {
if (domainFromInputLen + 1 !== domainFromFullSetLen) { found.forEach(f => {
continue; const index = chunk.indexOf(f);
if (index !== -1) {
outputToBeRemoved[index] = 1;
} }
})
}
// !domainFromInput.starsWith('.') && `.${domainFromInput}` === domainFromFullSet const a = domainStartsWithADotAndFromFullSet.slice(1);
if ( if (trie.has(a)) {
domainFromInputChunk[0] !== DOT const index = chunk.indexOf(a);
&& domainStartsWithADotAndFromFullSet.endsWith(domainFromInputChunk) if (index !== -1) {
) { outputToBeRemoved[index] = 1;
outputToBeRemoved[i] = 1;
// log(domainFromInputChunk, domainStartsWithADotAndFromFullSet)
break;
}
} else if (
domainFromInputLen > domainFromFullSetLen
&& domainFromInputChunk.endsWith(domainStartsWithADotAndFromFullSet)
) {
outputToBeRemoved[i] = 1;
// log(domainFromInputChunk, domainStartsWithADotAndFromFullSet)
break;
} }
} }
} }