Perf: improve reject set dedupe performance

This commit is contained in:
SukkaW 2022-08-31 02:25:21 +08:00
parent 69c196c8a5
commit 39f3dacf6e
2 changed files with 45 additions and 49 deletions

View File

@ -148,24 +148,56 @@ const threads = require('os').cpus().length - 1;
// Pass 1 (in-process): mark every domain that overlaps a keyword, ends with a
// listed suffix, or overlaps a whitelist entry, then remove the marked domains.
console.log(`Start deduping! (${previousSize})`);
const toBeRemoved = new Set();
scanDomains:
for (const domain of domainSets) {
  // Keyword check: substring match in either direction.
  for (const keyword of domainKeywordsSet) {
    if (domain.includes(keyword) || keyword.includes(domain)) {
      toBeRemoved.add(domain);
      continue scanDomains;
    }
  }
  // Suffix check: only reached when no keyword matched.
  for (const suffix of domainSuffixSet) {
    if (domain.endsWith(suffix)) {
      toBeRemoved.add(domain);
      continue scanDomains;
    }
  }
  // Whitelist check: only reached when neither keyword nor suffix matched.
  for (const white of filterRuleWhitelistDomainSets) {
    if (domain.includes(white) || white.includes(domain)) {
      toBeRemoved.add(domain);
      continue scanDomains;
    }
  }
}
// Removal happens only after the scan over domainSets has finished.
for (const removed of toBeRemoved) {
  domainSets.delete(removed);
}
console.log(`Deduped ${previousSize - domainSets.size} from black keywords and suffixes!`);
previousSize = domainSets.size;

// Pass 2 (Piscina): hand a snapshot of the remaining domains to the worker
// module and run its 'dedupeKeywords' and 'whitelisted' tasks concurrently.
console.log(`Start deduping! (${previousSize})`);
const piscina = new Piscina({
  filename: pathResolve(__dirname, 'worker/build-reject-domainset-worker.js'),
  workerData: [...domainSets]
});
// Each task resolves to a collection of domains to drop from domainSets.
for (const removalSet of await Promise.all([
  piscina.run(
    { keywords: domainKeywordsSet, suffixes: domainSuffixSet },
    { name: 'dedupeKeywords' }
  ),
  piscina.run(
    { whiteList: filterRuleWhitelistDomainSets },
    { name: 'whitelisted' }
  )
])) {
  for (const item of removalSet) {
    domainSets.delete(item);
  }
}
(await Promise.all(
Array.from(domainSets)
.reduce((result, element, index) => {

View File

@ -43,39 +43,3 @@ exports.dedupe = ({ chunk }) => {
return outputToBeRemoved;
};
exports.whitelisted = ({ whiteList }) => {
const outputToBeRemoved = new Set();
for (const domain of workerData) {
for (const white of whiteList) {
if (domain.includes(white) || white.includes(domain)) {
outputToBeRemoved.add(domain);
break;
}
}
}
return outputToBeRemoved;
};
exports.dedupeKeywords = ({ keywords, suffixes }) => {
const outputToBeRemoved = new Set();
for (const domain of workerData) {
for (const keyword of keywords) {
if (domain.includes(keyword) || keyword.includes(domain)) {
outputToBeRemoved.add(domain);
break;
}
}
for (const suffix of suffixes) {
if (domain.endsWith(suffix)) {
outputToBeRemoved.add(domain);
break;
}
}
}
return outputToBeRemoved;
}