Perf: fastest dedupe

This commit is contained in:
SukkaW 2023-07-07 15:07:45 +08:00
parent 9dd9e4aa05
commit b659bff079
2 changed files with 22 additions and 30 deletions

View File

@ -183,15 +183,17 @@ const domainSuffixSet = new Set();
const START_TIME = Date.now();
const domainSetsArray = Array.from(domainSets);
const workerData = preprocessFullDomainSetBeforeUsedAsWorkerData(domainSetsArray);
const piscina = new Piscina({
filename: pathResolve(__dirname, 'worker/build-reject-domainset-worker.js'),
workerData: preprocessFullDomainSetBeforeUsedAsWorkerData(Array.from(domainSetsArray)),
workerData,
idleTimeout: 50,
minThreads: threads,
maxThreads: threads
});
console.log(preprocessFullDomainSetBeforeUsedAsWorkerData(Array.from(domainSetsArray)).length);
console.log(workerData.length);
console.log(`Launching ${threads} threads...`);

View File

@ -1,8 +1,9 @@
// @ts-check
const Piscina = require('piscina');
const Trie = require('../lib/trie');
// const { isCI } = require('ci-info');
/** @type {string[]} */
const fullsetDomainStartsWithADot = Piscina.workerData
const fullsetDomainStartsWithADot = Piscina.workerData;
const totalLen = fullsetDomainStartsWithADot.length;
const DOT = '.';
@ -15,38 +16,27 @@ module.exports = ({ chunk }) => {
const chunkLength = chunk.length;
const outputToBeRemoved = new Int8Array(chunkLength);
for (let i = 0; i < chunkLength; i++) {
const domainFromInputChunk = chunk[i];
const domainFromInputLen = domainFromInputChunk.length;
const trie = Trie.from(chunk);
for (let j = 0; j < totalLen; j++) {
const domainStartsWithADotAndFromFullSet = fullsetDomainStartsWithADot[j];
// domainFromFullSet always starts with "."
if (domainStartsWithADotAndFromFullSet === domainFromInputChunk) continue;
for (let j = 0; j < totalLen; j++) {
const domainStartsWithADotAndFromFullSet = fullsetDomainStartsWithADot[j];
const domainFromFullSetLen = domainStartsWithADotAndFromFullSet.length;
const found = trie.find(domainStartsWithADotAndFromFullSet, false)
if (domainFromInputLen < domainFromFullSetLen) {
if (domainFromInputLen + 1 !== domainFromFullSetLen) {
continue;
if (found.length) {
found.forEach(f => {
const index = chunk.indexOf(f);
if (index !== -1) {
outputToBeRemoved[index] = 1;
}
})
}
// !domainFromInput.startsWith('.') && `.${domainFromInput}` === domainFromFullSet
if (
domainFromInputChunk[0] !== DOT
&& domainStartsWithADotAndFromFullSet.endsWith(domainFromInputChunk)
) {
outputToBeRemoved[i] = 1;
// log(domainFromInputChunk, domainStartsWithADotAndFromFullSet)
break;
}
} else if (
domainFromInputLen > domainFromFullSetLen
&& domainFromInputChunk.endsWith(domainStartsWithADotAndFromFullSet)
) {
outputToBeRemoved[i] = 1;
// log(domainFromInputChunk, domainStartsWithADotAndFromFullSet)
break;
const a = domainStartsWithADotAndFromFullSet.slice(1);
if (trie.has(a)) {
const index = chunk.indexOf(a);
if (index !== -1) {
outputToBeRemoved[index] = 1;
}
}
}