Fix/Perf: more efficient and correct whitelisting

This commit is contained in:
SukkaW 2024-05-27 01:09:11 +08:00
parent 8b1eeb1c14
commit 2f329a4144
4 changed files with 52 additions and 35 deletions

View File

@ -94,9 +94,6 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
} }
}); });
// Remove as many domains as possible from domainSets before creating trie
SetSubstract(domainSets, filterRuleWhitelistDomainSets);
// Perform kwfilter to remove as many domains as possible from domainSets before creating trie // Perform kwfilter to remove as many domains as possible from domainSets before creating trie
childSpan.traceChildSync('dedupe from black keywords', () => { childSpan.traceChildSync('dedupe from black keywords', () => {
const kwfilter = createKeywordFilter(domainKeywordsSet); const kwfilter = createKeywordFilter(domainKeywordsSet);
@ -110,11 +107,14 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => {
}); });
}); });
const trie = createTrie(domainSets, true, true); const trie = span.traceChildSync('dedupe from white suffixes', () => {
span.traceChildSync('dedupe from white suffixes', () => { const trie = createTrie(domainSets, true, true);
filterRuleWhitelistDomainSets.forEach(suffix => { filterRuleWhitelistDomainSets.forEach(suffix => {
trie.whitelist(suffix); trie.whitelist(suffix);
}); });
return trie;
}); });
// Dedupe domainSets // Dedupe domainSets

View File

@ -251,7 +251,7 @@ export const PREDEFINED_WHITELIST = [
'business.site', // Drag'n'Drop site building platform 'business.site', // Drag'n'Drop site building platform
'page.link', // Firebase URL Shortener 'page.link', // Firebase URL Shortener
'notion.site' 'notion.site'
]; ].map(suffix => `.${suffix}`);
export const PREDEFINED_ENFORCED_WHITELIST = [ export const PREDEFINED_ENFORCED_WHITELIST = [
'r2.dev', 'r2.dev',

View File

@ -208,6 +208,26 @@ describe('smol tree', () => {
]); ]);
}); });
it('should whitelist trie correctly', () => {
  // Fixture mixes a dot-prefixed entry, its bare domain, one of its subdomains,
  // and an unrelated dot-prefixed entry.
  const seedDomains = ['.t.co', 't.co', 'example.t.co', '.skk.moe'];
  const domainTrie = createTrie(seedDomains, true, true);

  // Before any whitelisting, only the two dot-prefixed entries are dumped.
  expect(domainTrie.dump()).toStrictEqual(['.skk.moe', '.t.co']);

  // Whitelisting the dot-prefixed form drops `.t.co` from the dump.
  domainTrie.whitelist('.t.co');
  expect(domainTrie.dump()).toStrictEqual(['.skk.moe']);

  // Whitelisting the bare domain must also drop the stored `.skk.moe` entry.
  domainTrie.whitelist('skk.moe');
  expect(domainTrie.dump()).toStrictEqual([]);
});
it('should efficiently whitelist domains', () => { it('should efficiently whitelist domains', () => {
const trie = createTrie([ const trie = createTrie([
'skk.moe', 'skk.moe',
@ -231,7 +251,6 @@ describe('smol tree', () => {
]); ]);
trie.whitelist('anotherskk.moe'); trie.whitelist('anotherskk.moe');
expect(trie.dump()).toStrictEqual([ expect(trie.dump()).toStrictEqual([
'blog.anotherskk.moe' 'blog.anotherskk.moe'
]); ]);

View File

@ -370,27 +370,11 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
parent = node; parent = node;
node = node.get(token); node = node.get(token);
if (!node) { if (!node) return;
return;
}
// Keeping track of a potential branch to prune
// If the node is to be pruned, but there is more than one token child in it, we can't prune it
// If there is only one token child, or no child at all, we can prune it safely
const onlyChild = node.size === 1 && node.has(token);
if (onlyChild) {
toPrune = parent;
tokenToPrune = token;
} else if (toPrune !== null) { // not only child, retain the branch
toPrune = null;
tokenToPrune = null;
}
// During the whitelist of `[start]blog.skk.moe` and find out that there is a `[start].skk.moe` in the trie // During the whitelist of `[start]blog.skk.moe` and find out that there is a `[start].skk.moe` in the trie
// Dedupe the covered subdomain by skipping // Dedupe the covered subdomain by skipping
if (node.get('.')?.[SENTINEL]) { if (i > 1 && node.get('.')?.[SENTINEL] === true) {
return; return;
} }
@ -399,21 +383,35 @@ export const createTrie = (from?: string[] | Set<string> | null, hostnameMode =
// If there is a `[start]sub.example.com` here, remove it // If there is a `[start]sub.example.com` here, remove it
node[SENTINEL] = false; node[SENTINEL] = false;
// Removing the rest of the child nodes by creating a new node and disconnecting the old one // Removing all the child nodes by disconnecting "."
const newNode = createNode(node); node.delete('.');
node.set('.', newNode); } else if (i === 0) {
node = newNode; // Trying to whitelist `example.com` when there is already a `.example.com` in the trie
break; const dotNode = node.get('.');
} if (dotNode?.[SENTINEL] === true) {
if (i === 0) { dotNode[SENTINEL] = false;
// Trying to add `example.com` when there is already a `.example.com` in the trie
if (node.get('.')?.[SENTINEL] === true) {
return;
} }
} }
// Keeping track of a potential branch to prune
// If the node is to be pruned, but there is more than one token child in it, we can't prune it
// If there is only one token child, or no child at all, we can prune it safely
if (toPrune != null) { // the first branch that could potentially being pruned
if (node.size > 1 || node.has('.')) {
// not only child, retain the branch.
// We also need to abort pruning the parent, so we reset it to null
toPrune = null;
tokenToPrune = null;
}
} else if (node.size < 2 && !node.has('.')) {
toPrune = parent;
tokenToPrune = token;
}
} }
if (!node[SENTINEL]) return false; if (!node[SENTINEL]) return false;
if (tokenToPrune && toPrune) { if (tokenToPrune && toPrune) {
toPrune.delete(tokenToPrune); toPrune.delete(tokenToPrune);
} else { } else {