Perf: simplify white suffix dedupe

This commit is contained in:
SukkaW 2024-05-26 01:02:29 +08:00
parent 48b5f609dd
commit 02bff12245
3 changed files with 21 additions and 43 deletions

View File

@ -15,7 +15,7 @@ import * as tldts from 'tldts';
import { SHARED_DESCRIPTION } from './lib/constants'; import { SHARED_DESCRIPTION } from './lib/constants';
import { getPhishingDomains } from './lib/get-phishing-domains'; import { getPhishingDomains } from './lib/get-phishing-domains';
import * as SetHelpers from 'mnemonist/set'; import { add as SetAdd, subtract as SetSubstract } from 'mnemonist/set';
import { setAddFromArray } from './lib/set-add-from-array'; import { setAddFromArray } from './lib/set-add-from-array';
import { sort } from './lib/timsort'; import { sort } from './lib/timsort';
@ -23,7 +23,7 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
/** Whitelists */ /** Whitelists */
const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST); const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST);
const domainSets = new Set<string>(); let domainSets = new Set<string>();
// Parse from AdGuard Filters // Parse from AdGuard Filters
const shouldStop = await span const shouldStop = await span
@ -33,9 +33,9 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
let shouldStop = false; let shouldStop = false;
await Promise.all([ await Promise.all([
// Parse from remote hosts & domain lists // Parse from remote hosts & domain lists
...HOSTS.map(entry => processHosts(childSpan, entry[0], entry[1], entry[2], entry[3]).then(hosts => SetHelpers.add(domainSets, hosts))), ...HOSTS.map(entry => processHosts(childSpan, entry[0], entry[1], entry[2], entry[3]).then(hosts => SetAdd(domainSets, hosts))),
...DOMAIN_LISTS.map(entry => processDomainLists(childSpan, entry[0], entry[1], entry[2]).then(hosts => SetHelpers.add(domainSets, hosts))), ...DOMAIN_LISTS.map(entry => processDomainLists(childSpan, entry[0], entry[1], entry[2]).then(hosts => SetAdd(domainSets, hosts))),
...ADGUARD_FILTERS.map(input => ( ...ADGUARD_FILTERS.map(input => (
typeof input === 'string' typeof input === 'string'
@ -58,7 +58,7 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
setAddFromArray(filterRuleWhitelistDomainSets, black); setAddFromArray(filterRuleWhitelistDomainSets, black);
}))), }))),
getPhishingDomains(childSpan).then(([purePhishingDomains, fullPhishingDomainSet]) => { getPhishingDomains(childSpan).then(([purePhishingDomains, fullPhishingDomainSet]) => {
SetHelpers.add(domainSets, fullPhishingDomainSet); SetAdd(domainSets, fullPhishingDomainSet);
setAddFromArray(domainSets, purePhishingDomains); setAddFromArray(domainSets, purePhishingDomains);
}), }),
childSpan.traceChildAsync('process reject_sukka.conf', async () => { childSpan.traceChildAsync('process reject_sukka.conf', async () => {
@ -94,22 +94,17 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
}); });
// Remove as many domains as possible from domainSets before creating trie // Remove as many domains as possible from domainSets before creating trie
SetHelpers.subtract(domainSets, filterRuleWhitelistDomainSets); SetSubstract(domainSets, filterRuleWhitelistDomainSets);
childSpan.traceChildSync('dedupe from white suffixes', () => { domainSets = new Set(childSpan.traceChildSync('dedupe from white suffixes', () => {
const trie = createTrie(domainSets); const trie = createTrie(domainSets, true, true);
filterRuleWhitelistDomainSets.forEach(suffix => { filterRuleWhitelistDomainSets.forEach(suffix => {
trie.substractSetInPlaceFromFound(suffix, domainSets); trie.whitelist(suffix);
if (suffix[0] === '.') {
domainSets.delete(suffix.slice(1));
domainSets.delete(suffix);
} else {
domainSets.delete(`.${suffix}`);
domainSets.delete(suffix);
}
}); });
});
return trie.dump();
}));
childSpan.traceChildSync('dedupe from black keywords', () => { childSpan.traceChildSync('dedupe from black keywords', () => {
const kwfilter = createKeywordFilter(domainKeywordsSet); const kwfilter = createKeywordFilter(domainKeywordsSet);

View File

@ -1,22 +1,11 @@
import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix'; import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
import { processDomainLists } from './parse-filter'; import { processDomainLists } from './parse-filter';
import * as tldts from 'tldts'; import * as tldts from 'tldts';
import { createTrie } from './trie';
import { TTL } from './cache-filesystem'; import { TTL } from './cache-filesystem';
import { add as SetAdd } from 'mnemonist/set'; import { add as SetAdd } from 'mnemonist/set';
import type { Span } from '../trace'; import type { Span } from '../trace';
const WHITELIST_DOMAIN = [
'w3s.link',
'dweb.link',
'nftstorage.link',
'square.site',
'business.site',
'page.link', // Firebase URL Shortener
'fleek.cool',
'notion.site'
];
const BLACK_TLD = new Set([ const BLACK_TLD = new Set([
'accountant', 'accountant',
'autos', 'autos',
@ -114,20 +103,6 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
return domainSet; return domainSet;
}); });
span.traceChildSync('whitelisting phishing domains', (curSpan) => {
const trieForRemovingWhiteListed = curSpan.traceChildSync('create trie for whitelisting', () => createTrie(domainSet));
return curSpan.traceChild('delete whitelisted from domainset').traceSyncFn(() => {
for (let i = 0, len = WHITELIST_DOMAIN.length; i < len; i++) {
const white = WHITELIST_DOMAIN[i];
domainSet.delete(white);
domainSet.delete(`.${white}`);
trieForRemovingWhiteListed.substractSetInPlaceFromFound(`.${white}`, domainSet);
}
});
});
const domainCountMap: Record<string, number> = {}; const domainCountMap: Record<string, number> = {};
span.traceChildSync('process phishing domain set', () => { span.traceChildSync('process phishing domain set', () => {

View File

@ -224,8 +224,16 @@ export const PREDEFINED_WHITELIST = [
'email.accounts.bitly.com', // Fuck Peter Lowe Hosts 'email.accounts.bitly.com', // Fuck Peter Lowe Hosts
'adsense.google.com', // Fuck Peter Lowe Hosts 'adsense.google.com', // Fuck Peter Lowe Hosts
'api.vip.miui.com', // Fuck Goodbye Xiaomi Ads 'api.vip.miui.com', // Fuck Goodbye Xiaomi Ads
'stripe.com' // EasyPrivacy only blocks m.stripe.com with $third-party, 'stripe.com', // EasyPrivacy only blocks m.stripe.com with $third-party,
// yet stupid AdGuardDNSFilter blocks all of it. Stupid AdGuard // yet stupid AdGuardDNSFilter blocks all of it. Stupid AdGuard
'w3s.link', // ipfs gateway
'dweb.link', // ipfs gateway
'nftstorage.link', // ipfs gateway
'fleek.cool', // ipfs gateway
'square.site', // Drag'n'Drop site building platform
'business.site', // Drag'n'Drop site building platform
'page.link', // Firebase URL Shortener
'notion.site'
]; ];
export const PREDEFINED_ENFORCED_WHITELIST = [ export const PREDEFINED_ENFORCED_WHITELIST = [