Perf: improve performance of reject suffix/keyword deduping

This commit is contained in:
SukkaW 2024-01-21 23:53:12 +08:00
parent 80deff88f9
commit 725f26b428
6 changed files with 39 additions and 55 deletions

View File

@ -109,7 +109,7 @@ async function transformDomainset(parentSpan: Span, sourcePath: string, relative
)
];
return createRuleset(
return span.traceAsyncFn(() => createRuleset(
span,
title,
description,
@ -118,7 +118,7 @@ async function transformDomainset(parentSpan: Span, sourcePath: string, relative
'domainset',
path.resolve(outputSurgeDir, relativePath),
path.resolve(outputClashDir, `${relativePath.slice(0, -path.extname(relativePath).length)}.txt`)
);
));
}
/**

View File

@ -4,12 +4,12 @@ import path from 'path';
import { processHosts, processFilterRules, processDomainLists } from './lib/parse-filter';
import { createTrie } from './lib/trie';
import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, PREDEFINED_ENFORCED_BACKLIST, DOMAIN_LISTS } from './lib/reject-data-source';
import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS } from './lib/reject-data-source';
import { createRuleset, compareAndWriteFile } from './lib/create-file';
import { processLine } from './lib/process-line';
import { domainDeduper } from './lib/domain-deduper';
import createKeywordFilter from './lib/aho-corasick';
import { readFileByLine } from './lib/fetch-text-by-line';
import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
import { sortDomains } from './lib/stable-sort-domain';
import { task } from './trace';
import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix';
@ -63,25 +63,10 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
setAddFromArray(domainSets, purePhishingDomains);
}),
childSpan.traceChild('process reject_sukka.conf').traceAsyncFn(async () => {
for await (const l of readFileByLine(path.resolve(import.meta.dir, '../Source/domainset/reject_sukka.conf'))) {
const line = processLine(l);
if (!line) continue;
domainSets.add(line);
}
setAddFromArray(domainSets, await readFileIntoProcessedArray(path.resolve(import.meta.dir, '../Source/domainset/reject_sukka.conf')));
})
]);
// remove pre-defined enforced blacklist from whitelist
const trie0 = createTrie(filterRuleWhitelistDomainSets);
for (let i = 0, len1 = PREDEFINED_ENFORCED_BACKLIST.length; i < len1; i++) {
const enforcedBlack = PREDEFINED_ENFORCED_BACKLIST[i];
const found = trie0.find(enforcedBlack);
for (let j = 0, len2 = found.length; j < len2; j++) {
filterRuleWhitelistDomainSets.delete(found[j]);
}
}
return shouldStop;
});
@ -116,25 +101,22 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
});
filterRuleWhitelistDomainSets.forEach(suffix => {
trie1.find(suffix, true).forEach(f => domainSets.delete(f));
if (suffix[0] === '.') {
// handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`)
domainSets.delete(suffix.slice(1));
} else {
// If `g.msn.com` is whitelisted, then `.g.msn.com` should be removed from domain set
domainSets.delete(`.${suffix}`);
}
});
// build the keyword matcher used below to drop domains that contain a rejected keyword
const kwfilter = createKeywordFilter(domainKeywordsSet);
// drop entries whose dotted form is whitelisted, e.g. remove `g.msn.com` because `.g.msn.com` is whitelisted (`@@||g.msn.com`)
for (const domain of domainSets) {
if (domain[0] === '.') {
if (filterRuleWhitelistDomainSets.has(domain)) {
domainSets.delete(domain);
continue;
}
} else if (filterRuleWhitelistDomainSets.has(`.${domain}`)) {
domainSets.delete(domain);
continue;
}
// Remove keyword
if (kwfilter.search(domain)) {
if (kwfilter(domain)) {
domainSets.delete(domain);
}
}

View File

@ -77,7 +77,7 @@ const createKeywordFilter = (keys: string[] | Set<string>) => {
build();
const search = (text: string) => {
return (text: string) => {
let node: Node | undefined = root;
for (let i = 0, textLen = text.length; i < textLen; i++) {
@ -96,10 +96,6 @@ const createKeywordFilter = (keys: string[] | Set<string>) => {
return false;
};
return {
search
};
};
export default createKeywordFilter;

View File

@ -99,17 +99,19 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
SetAdd(domainSet, domainSet2);
}
span.traceChild('whitelisting phishing domains').traceSyncFn(() => {
const trieForRemovingWhiteListed = createTrie(domainSet);
span.traceChild('whitelisting phishing domains').traceSyncFn((parentSpan) => {
const trieForRemovingWhiteListed = parentSpan.traceChild('create trie for whitelisting').traceSyncFn(() => createTrie(domainSet));
for (let i = 0, len = WHITELIST_DOMAIN.length; i < len; i++) {
const white = WHITELIST_DOMAIN[i];
const found = trieForRemovingWhiteListed.find(`.${white}`, true);
for (let j = 0, len2 = found.length; j < len2; j++) {
domainSet.delete(found[j]);
return parentSpan.traceChild('delete whitelisted from domainset').traceSyncFn(() => {
for (let i = 0, len = WHITELIST_DOMAIN.length; i < len; i++) {
const white = WHITELIST_DOMAIN[i];
const found = trieForRemovingWhiteListed.find(`.${white}`, true);
for (let j = 0, len2 = found.length; j < len2; j++) {
domainSet.delete(found[j]);
}
domainSet.delete(white);
}
domainSet.delete(white);
}
});
});
const domainCountMap: Record<string, number> = {};
@ -177,11 +179,15 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
}
});
const results = span.traceChild('get final phishing results').traceSyncFn(
() => Object.entries(domainCountMap)
.filter(entries => entries[1] >= 5)
.map(entries => entries[0])
);
const results = span.traceChild('get final phishing results').traceSyncFn(() => {
  // Collect every domain whose counter in `domainCountMap` reached the threshold.
  const matched: string[] = [];
  for (const domain in domainCountMap) {
    // The bound is inclusive: a count of exactly 5 qualifies. This is the
    // filter this step applied before it was rewritten as a plain loop, so
    // the rewrite must not silently raise the cut-off to 6.
    if (domainCountMap[domain] >= 5) {
      matched.push(domain);
    }
  }
  return matched;
});
return [results, domainSet] as const;
});

View File

@ -211,10 +211,6 @@ export const PREDEFINED_WHITELIST = [
'pstmrk.it'
];
export const PREDEFINED_ENFORCED_BACKLIST = [
'telemetry.mozilla.org'
];
export const PREDEFINED_ENFORCED_WHITELIST = [
'godaddysites.com',
'web.app',

View File

@ -43,6 +43,10 @@ DOMAIN-SUFFIX,pantheonsite.io
DOMAIN-SUFFIX,sitebeat.crazydomains.com
# >> Snowplow Analytics (publicsuffix)
DOMAIN-SUFFIX,try-snowplow.com
# >> Mozilla Telemetry (Enforcing)
DOMAIN-SUFFIX,telemetry-coverage.mozilla.org
DOMAIN-SUFFIX,telemetry.mozilla.org
DOMAIN-SUFFIX,incoming-telemetry.thunderbird.net
# >> Phishing
DOMAIN-SUFFIX,gofenews.com