Refactor: rewrite trie in class

This commit is contained in:
SukkaW 2024-10-02 22:01:38 +08:00
parent d1041f0e59
commit a7e7c19a51
7 changed files with 187 additions and 185 deletions

View File

@ -10,8 +10,7 @@ import { processLine } from './lib/process-line';
import { DomainsetOutput } from './lib/create-file';
const getS3OSSDomainsPromise = (async (): Promise<string[]> => {
const trie = createTrie(
(await getPublicSuffixListTextPromise()).reduce<string[]>(
const trie = createTrie((await getPublicSuffixListTextPromise()).reduce<string[]>(
(acc, cur) => {
const tmp = processLine(cur);
if (tmp) {
@ -20,9 +19,7 @@ const getS3OSSDomainsPromise = (async (): Promise<string[]> => {
return acc;
},
[]
),
true
);
));
/**
* Extract OSS domain from publicsuffix list

View File

@ -1,6 +1,6 @@
import { task } from './trace';
import { fetchRemoteTextByLine } from './lib/fetch-text-by-line';
import { createTrie } from './lib/trie';
import { HostnameSmolTrie } from './lib/trie';
import { SHARED_DESCRIPTION } from './lib/constants';
import { createMemoizedPromise } from './lib/memo-promise';
import { extractDomainsFromFelixDnsmasq } from './lib/parse-dnsmasq';
@ -27,7 +27,7 @@ const BLACKLIST = [
export const getMicrosoftCdnRulesetPromise = createMemoizedPromise<[domains: string[], domainSuffixes: string[]]>(async () => {
// First trie is to find the microsoft domains that matches probe domains
const trie = createTrie(null, true);
const trie = new HostnameSmolTrie();
for await (const line of await fetchRemoteTextByLine('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf')) {
const domain = extractDomainsFromFelixDnsmasq(line);
if (domain) {
@ -37,8 +37,8 @@ export const getMicrosoftCdnRulesetPromise = createMemoizedPromise<[domains: str
const foundMicrosoftCdnDomains = PROBE_DOMAINS.flatMap(domain => trie.find(domain));
// Second trie is to remove blacklisted domains
const trie2 = createTrie(foundMicrosoftCdnDomains, true);
BLACKLIST.forEach(trie2.whitelist);
const trie2 = new HostnameSmolTrie(foundMicrosoftCdnDomains);
BLACKLIST.forEach(black => trie2.whitelist(black));
const domains: string[] = DOMAINS;
const domainSuffixes: string[] = DOMAIN_SUFFIXES;

View File

@ -191,7 +191,7 @@ async function processPhihsingDomains(domainArr: string[]) {
);
}
export function calcDomainAbuseScore(subdomain: string, fullDomain: string) {
export function calcDomainAbuseScore(subdomain: string, fullDomain: string = subdomain) {
let weight = 0;
const hitLowKeywords = lowKeywords(fullDomain);

View File

@ -56,6 +56,7 @@ describe('Trie', () => {
trie.add('skk.moe');
trie.add('blog.skk.moe');
// eslint-disable-next-line sukka/no-element-overwrite -- deliberately do testing
trie.add('skk.moe');
expect(trie.size).to.equal(2);
@ -63,18 +64,18 @@ describe('Trie', () => {
});
it('should be possible to set the null sequence.', () => {
let trie = createTrie(null, false);
const trie = createTrie(null, false);
trie.add('');
expect(trie.has('')).to.equal(true);
trie = createTrie(null, true);
trie.add('');
expect(trie.has('')).to.equal(true);
const trie2 = createTrie(null, true);
trie2.add('');
expect(trie2.has('')).to.equal(true);
});
it('should be possible to delete items.', () => {
const trie = createTrie(null);
const trie = createTrie(null, false);
trie.add('skk.moe');
trie.add('example.com');
@ -108,7 +109,7 @@ describe('Trie', () => {
});
it('should be possible to retrieve items matching the given prefix.', () => {
const trie = createTrie(null);
const trie = createTrie(null, false);
trie.add('example.com');
trie.add('blog.example.com');
@ -141,12 +142,12 @@ describe('Trie', () => {
});
it('should be possible to create a trie from an arbitrary iterable.', () => {
let trie = createTrie(['skk.moe', 'blog.skk.moe']);
let trie = createTrie(['skk.moe', 'blog.skk.moe'], false);
expect(trie.size).to.equal(2);
expect(trie.has('skk.moe')).to.equal(true);
trie = createTrie(new Set(['skk.moe', 'example.com']));
trie = createTrie(new Set(['skk.moe', 'example.com']), false);
expect(trie.size).to.equal(2);
expect(trie.has('skk.moe')).to.equal(true);
});
@ -154,28 +155,28 @@ describe('Trie', () => {
describe('surge domainset dedupe', () => {
it('should not remove same entry', () => {
const trie = createTrie(['.skk.moe', 'noc.one']);
const trie = createTrie(['.skk.moe', 'noc.one'], false);
expect(trie.find('.skk.moe')).to.deep.equal(['.skk.moe']);
expect(trie.find('noc.one')).to.deep.equal(['noc.one']);
});
it('should match subdomain - 1', () => {
const trie = createTrie(['www.noc.one', 'www.sukkaw.com', 'blog.skk.moe', 'image.cdn.skk.moe', 'cdn.sukkaw.net']);
const trie = createTrie(['www.noc.one', 'www.sukkaw.com', 'blog.skk.moe', 'image.cdn.skk.moe', 'cdn.sukkaw.net'], false);
expect(trie.find('.skk.moe')).to.deep.equal(['image.cdn.skk.moe', 'blog.skk.moe']);
expect(trie.find('.sukkaw.com')).to.deep.equal(['www.sukkaw.com']);
});
it('should match subdomain - 2', () => {
const trie = createTrie(['www.noc.one', 'www.sukkaw.com', '.skk.moe', 'blog.skk.moe', 'image.cdn.skk.moe', 'cdn.sukkaw.net']);
const trie = createTrie(['www.noc.one', 'www.sukkaw.com', '.skk.moe', 'blog.skk.moe', 'image.cdn.skk.moe', 'cdn.sukkaw.net'], false);
expect(trie.find('.skk.moe')).to.deep.equal(['.skk.moe', 'image.cdn.skk.moe', 'blog.skk.moe']);
expect(trie.find('.sukkaw.com')).to.deep.equal(['www.sukkaw.com']);
});
it('should not remove non-subdomain', () => {
const trie = createTrie(['skk.moe', 'sukkaskk.moe']);
const trie = createTrie(['skk.moe', 'sukkaskk.moe'], false);
expect(trie.find('.skk.moe')).to.deep.equal([]);
});
});

View File

@ -80,16 +80,281 @@ const walkHostnameTokens = (hostname: string, onToken: (token: string) => boolea
return false;
};
export const createTrie = <Meta = any>(from?: string[] | Set<string> | null, smolTree = false) => {
let size = 0;
const root: TrieNode<Meta> = createNode();
/**
 * Outcome of walking a token path while tracking which branch, if any,
 * could be detached from the trie.
 */
interface FindSingleChildLeafResult<Meta> {
/** Node reached at the end of the token path. */
node: TrieNode<Meta>,
/** Node whose child entry may be deleted to drop the branch, or null when no branch qualifies. */
toPrune: TrieNode<Meta> | null,
/** Key under `toPrune` identifying the child to delete, or null when no branch qualifies. */
tokenToPrune: string | null,
/** Node visited immediately before `node` during the walk. */
parent: TrieNode<Meta>
}
abstract class Triebase<Meta = any> {
/** Root node of the trie, created once per instance through `createNode()`. */
protected readonly $root: TrieNode<Meta> = createNode();
/** Counter of stored entries; `remove` decrements it on every successful removal. */
protected $size = 0;
/** Read-only access to the root node. */
get root() {
return this.$root;
}
/**
 * Creates the trie and optionally seeds it with entries.
 *
 * @param from Array or Set of suffixes, each inserted through `add`;
 *   `null` or `undefined` leaves the trie empty.
 */
constructor(from?: string[] | Set<string> | null) {
  if (!from) {
    return;
  }
  // Arrays and Sets are both walked in their natural iteration order.
  for (const entry of from) {
    this.add(entry);
  }
}
/** Inserts `suffix` into the trie with optional `meta`; each concrete trie supplies the implementation. */
public abstract add(suffix: string, meta?: Meta): void;
/**
 * Follows an already-tokenized path from the root, consuming `tokens`
 * from the last element to the first.
 *
 * @param tokens Token list describing the path.
 * @param onLoop Invoked after every successful step with the node just
 *   entered, the node it was entered from, and the token that was followed.
 * @returns The final node and the node visited just before it, or `null`
 *   as soon as a token has no matching child.
 */
protected walkIntoLeafWithTokens(
  tokens: string[],
  onLoop: (node: TrieNode, parent: TrieNode, token: string) => void = noop
) {
  let current: TrieNode = this.$root;
  let previous: TrieNode = current;

  let index = tokens.length;
  while (index-- > 0) {
    const token = tokens[index];

    if (!current[2].has(token)) {
      return null;
    }

    previous = current;
    current = current[2].get(token)!;

    onLoop(current, previous, token);
  }

  return { node: current, parent: previous };
}
/**
 * Follows the path described by a raw suffix string, letting
 * `walkHostnameTokens` produce the tokens.
 *
 * @param suffix Hostname suffix to follow.
 * @param onLoop Invoked after every successful step with the node just
 *   entered, the node it was entered from, and the token that was followed.
 * @returns The final node and the node visited just before it, or `null`
 *   when `walkHostnameTokens` reports `null` (a token had no matching child).
 */
protected walkIntoLeafWithSuffix(
  suffix: string,
  onLoop: (node: TrieNode, parent: TrieNode, token: string) => void = noop
) {
  let current: TrieNode = this.$root;
  let previous: TrieNode = current;

  const step = (token: string) => {
    // An empty token answers `true` to the tokenizer.
    if (token === '') {
      return true;
    }

    previous = current;

    if (!current[2].has(token)) {
      return null;
    }
    current = current[2].get(token)!;

    onLoop(current, previous, token);

    return false;
  };

  const outcome = walkHostnameTokens(suffix, step);
  if (outcome === null) {
    return null;
  }

  return { node: current, parent: previous };
}
/**
 * Reports whether the whole path for `suffix` exists in the trie.
 * Only the path is checked; the final node's entry flag is not consulted.
 */
public contains(suffix: string): boolean {
  const reached = this.walkIntoLeafWithSuffix(suffix);
  return reached !== null;
}
/**
 * Stack-based traversal of the subtree under `initialNode`.
 *
 * @param onMatches Called for every visited node whose entry flag
 *   (`node[0]`) is set, with the accumulated token list and the node's meta.
 * @param initialNode Node to start from; defaults to the root.
 * @param initialSuffix Tokens already accumulated for `initialNode`.
 */
private walk(
  onMatches: (suffix: string[], meta: Meta) => void,
  initialNode = this.$root,
  initialSuffix: string[] = []
) {
  const pendingNodes: Array<TrieNode<Meta>> = [initialNode];
  // Kept in lockstep with pendingNodes: one token list per queued node.
  const pendingSuffixes: string[][] = [initialSuffix];

  while (pendingNodes.length > 0) {
    const current = pendingNodes.pop()!;
    const currentSuffix = pendingSuffixes.pop()!;

    // Queue every child, prepending its token to the accumulated list.
    for (const [token, child] of current[2]) {
      pendingNodes.push(child);
      pendingSuffixes.push([token, ...currentSuffix]);
    }

    if (current[0]) {
      onMatches(currentSuffix, current[3]);
    }
  }
}
/**
 * Walks the path described by `tokens` and, along the way, tracks the
 * highest branch that could be detached from the trie.
 *
 * @param tokens Token list describing the path to follow.
 * @returns The final node, its predecessor on the walk, and the prune
 *   candidate (`toPrune` / `tokenToPrune`, both null when there is none);
 *   `null` when the path does not exist.
 */
protected getSingleChildLeaf(tokens: string[]): FindSingleChildLeafResult<Meta> | null {
let toPrune: TrieNode | null = null;
let tokenToPrune: string | null = null;
const onLoop = (node: TrieNode, parent: TrieNode, token: string) => {
// Keep track of a branch that could potentially be pruned.
// A node counts as prunable only when it has at most one child and that
// child is not ".": a "." child is retained because it can be special
// when it is the deepest node.
const onlyChild = node[2].size < 2 && !node[2].has('.');
if (toPrune != null) { // a top-most prunable branch is already being tracked
if (!onlyChild) {
// This node has more than a single child (or a "." child), so the branch must be kept.
// Abort pruning from the tracked parent by resetting the candidate.
toPrune = null;
tokenToPrune = null;
}
} else if (onlyChild) {
// There is at most one child here, so the branch can be pruned safely.
// It becomes the top-most branch that could potentially be pruned.
toPrune = parent;
tokenToPrune = token;
}
};
const res = this.walkIntoLeafWithTokens(tokens, onLoop);
if (res === null) return null;
return { node: res.node, toPrune, tokenToPrune, parent: res.parent };
};
/**
* Method used to add the given suffix to the trie.
* Method used to retrieve every item in the trie with the given prefix.
*/
const add = smolTree
? (suffix: string, meta?: Meta): void => {
let node: TrieNode<Meta> = root;
/**
 * Collects every entry stored at or below the node reached by `inputSuffix`.
 *
 * @param inputSuffix Suffix whose subtree is searched.
 * @param includeEqualWithSuffix When `false`, an entry whose tokens equal
 *   those of `inputSuffix` is left out. Defaults to `true`.
 * @returns The matching entries as joined strings; empty when the path
 *   for `inputSuffix` does not exist.
 */
public find(
  inputSuffix: string,
  /** @default true */ includeEqualWithSuffix = true
): string[] {
  const inputTokens = hostnameToTokens(inputSuffix);

  const start = this.walkIntoLeafWithTokens(inputTokens);
  if (start === null) {
    return [];
  }

  const collected: string[][] = [];

  this.walk(
    (candidate: string[]) => {
      // The equality check only runs when exact matches must be excluded.
      if (includeEqualWithSuffix || !deepEqualArray(candidate, inputTokens)) {
        collected.push(candidate);
      }
    },
    start.node, // begin the traversal at the end of the input path
    inputTokens
  );

  return collected.map((tokens) => fastStringArrayJoin(tokens, ''));
}
/**
 * Removes `suffix` from the trie.
 *
 * @returns `true` when an entry was removed; `false` when the path does
 *   not exist or its final node is not marked as an entry.
 */
public remove(suffix: string): boolean {
  const located = this.getSingleChildLeaf(hostnameToTokens(suffix));

  if (located === null || !located.node[0]) {
    return false;
  }

  this.$size--;

  if (located.tokenToPrune && located.toPrune) {
    // A prunable branch was found: detach it from its parent.
    located.toPrune[2].delete(located.tokenToPrune);
  } else {
    // Otherwise only clear the entry flag and keep the node in place.
    located.node[0] = false;
  }

  return true;
}
/** Alias of `remove`, exposed as an instance property. */
// eslint-disable-next-line @typescript-eslint/unbound-method -- alias class methods
public delete = this.remove;
/**
 * Tells whether `suffix` is stored as an entry: its path must exist and
 * the final node must carry the entry flag.
 */
public has(suffix: string): boolean {
  const located = this.walkIntoLeafWithSuffix(suffix);
  if (located === null) {
    return false;
  }
  return located.node[0];
}
/**
 * Enumerates every entry in the trie as a joined string.
 *
 * With `onSuffix`, each entry is handed to the callback; without it, the
 * entries are gathered into the returned array.
 */
public dump(onSuffix: (suffix: string) => void): void;
public dump(): string[];
public dump(onSuffix?: (suffix: string) => void): string[] | void {
  const collected: string[] = [];

  this.walk((tokens: string[]) => {
    const joined = fastStringArrayJoin(tokens, '');
    if (onSuffix) {
      onSuffix(joined);
    } else {
      collected.push(joined);
    }
  });

  return collected;
}
/** Returns the meta value of every entry, in traversal order. */
public dumpMeta() {
  const metas: Meta[] = [];

  this.walk((_tokens, meta) => {
    metas.push(meta);
  });

  return metas;
}
/** Returns `[entry, meta]` pairs for every entry, in traversal order. */
public dumpWithMeta() {
  const pairs: Array<[string, Meta]> = [];

  this.walk((tokens, meta) => {
    const joined = fastStringArrayJoin(tokens, '');
    pairs.push([joined, meta]);
  });

  return pairs;
}
/**
 * Renders the trie as pretty-printed JSON with every line prefixed by
 * `depth` spaces.
 *
 * @param depth Number of spaces placed in front of each output line.
 * @param unpackMeta Optional converter passed to `deepTrieNodeToJSON`.
 */
public inspect(depth: number, unpackMeta?: (meta?: Meta) => any) {
  const json = JSON.stringify(deepTrieNodeToJSON(this.$root, unpackMeta), null, 2);
  const rawLines = json.split('\n');
  const padding = ' '.repeat(depth);
  const indented = rawLines.map((line) => padding + line);

  return fastStringArrayJoin(indented, '\n');
}
/** Method registered under the `util.inspect.custom` symbol; forwards `depth` to `inspect`. */
public [util.inspect.custom](depth: number) {
return this.inspect(depth);
};
}
export class HostnameSmolTrie<Meta = any> extends Triebase<Meta> {
public smolTree = true;
add(suffix: string, meta?: Meta): void {
let node: TrieNode<Meta> = this.$root;
let curNodeChildren: Map<string, TrieNode<Meta>> = node[2];
const onToken = (token: string) => {
@ -136,274 +401,10 @@ export const createTrie = <Meta = any>(from?: string[] | Set<string> | null, smo
node[0] = true;
node[3] = meta!;
}
: (suffix: string, meta?: Meta): void => {
let node: TrieNode<Meta> = root;
const onToken = (token: string) => {
if (node[2].has(token)) {
node = node[2].get(token)!;
} else {
const newNode = createNode(node);
node[2].set(token, newNode);
node = newNode;
}
return false;
};
// When walkHostnameTokens returns true, we should skip the rest
if (walkHostnameTokens(suffix, onToken)) {
return;
}
if (!node[0]) {
size++;
node[0] = true;
node[3] = meta!;
}
};
const walkIntoLeafWithTokens = (
tokens: string[],
onLoop: (node: TrieNode, parent: TrieNode, token: string) => void = noop
) => {
let node: TrieNode = root;
let parent: TrieNode = node;
let token: string;
for (let i = tokens.length - 1; i >= 0; i--) {
token = tokens[i];
// if (token === '') {
// break;
// }
parent = node;
if (node[2].has(token)) {
node = node[2].get(token)!;
} else {
return null;
}
onLoop(node, parent, token);
}
return { node, parent };
};
const walkIntoLeafWithSuffix = (
suffix: string,
onLoop: (node: TrieNode, parent: TrieNode, token: string) => void = noop
) => {
let node: TrieNode = root;
let parent: TrieNode = node;
const onToken = (token: string) => {
if (token === '') {
return true;
}
parent = node;
if (node[2].has(token)) {
node = node[2].get(token)!;
} else {
return null;
}
onLoop(node, parent, token);
return false;
};
if (walkHostnameTokens(suffix, onToken) === null) {
return null;
}
return { node, parent };
};
const contains = (suffix: string): boolean => walkIntoLeafWithSuffix(suffix) !== null;
const walk = (
onMatches: (suffix: string[], meta: Meta) => void,
initialNode = root,
initialSuffix: string[] = []
) => {
const nodeStack: Array<TrieNode<Meta>> = [initialNode];
// Resolving initial string (begin the start of the stack)
const suffixStack: string[][] = [initialSuffix];
let node: TrieNode<Meta> = root;
do {
node = nodeStack.pop()!;
const suffix = suffixStack.pop()!;
node[2].forEach((childNode, k) => {
// Pushing the child node to the stack for next iteration of DFS
nodeStack.push(childNode);
suffixStack.push([k, ...suffix]);
});
// If the node is a sentinel, we push the suffix to the results
if (node[0]) {
onMatches(suffix, node[3]);
}
} while (nodeStack.length);
};
interface FindSingleChildLeafResult {
node: TrieNode,
toPrune: TrieNode | null,
tokenToPrune: string | null,
parent: TrieNode
}
const getSingleChildLeaf = (tokens: string[]): FindSingleChildLeafResult | null => {
let toPrune: TrieNode | null = null;
let tokenToPrune: string | null = null;
const onLoop = (node: TrieNode, parent: TrieNode, token: string) => {
// Keeping track of a potential branch to prune
// Even if the node size is 1, but the single child is ".", we should retain the branch
// Since the "." could be special if it is the leaf-est node
const onlyChild = node[2].size < 2 && !node[2].has('.');
if (toPrune != null) { // the top-est branch that could potentially being pruned
if (!onlyChild) {
// The branch has moew than single child, retain the branch.
// And we need to abort prune the parent, so we set it to null
toPrune = null;
tokenToPrune = null;
}
} else if (onlyChild) {
// There is only one token child, or no child at all, we can prune it safely
// It is now the top-est branch that could potentially being pruned
toPrune = parent;
tokenToPrune = token;
}
};
const res = walkIntoLeafWithTokens(tokens, onLoop);
if (res === null) return null;
return { node: res.node, toPrune, tokenToPrune, parent: res.parent };
};
/**
* Method used to retrieve every item in the trie with the given prefix.
*/
const find = (
inputSuffix: string,
/** @default true */ includeEqualWithSuffix = true
): string[] => {
// if (smolTree) {
// throw new Error('A Trie with smolTree enabled cannot perform find!');
// }
const inputTokens = hostnameToTokens(inputSuffix);
const res = walkIntoLeafWithTokens(inputTokens);
if (res === null) return [];
const matches: string[][] = [];
const onMatches = includeEqualWithSuffix
// fast path (default option)
? (suffix: string[]) => matches.push(suffix)
// slow path
: (suffix: string[]) => {
if (!deepEqualArray(suffix, inputTokens)) {
matches.push(suffix);
}
};
walk(
onMatches,
res.node, // Performing DFS from prefix
inputTokens
);
return matches.map((m) => fastStringArrayJoin(m, ''));
};
/**
* Method used to delete a prefix from the trie.
*/
const remove = (suffix: string): boolean => {
const res = getSingleChildLeaf(hostnameToTokens(suffix));
if (res === null) return false;
if (!res.node[0]) return false;
size--;
const { node, toPrune, tokenToPrune } = res;
if (tokenToPrune && toPrune) {
toPrune[2].delete(tokenToPrune);
} else {
node[0] = false;
}
return true;
};
/**
* Method used to assert whether the given prefix exists in the Trie.
*/
const has = (suffix: string): boolean => {
const res = walkIntoLeafWithSuffix(suffix);
return res
? res.node[0]
: false;
};
function dump(onSuffix: (suffix: string) => void): void;
function dump(): string[];
function dump(onSuffix?: (suffix: string) => void): string[] | void {
const results: string[] = [];
const handleSuffix = onSuffix
? (suffix: string[]) => onSuffix(fastStringArrayJoin(suffix, ''))
: (suffix: string[]) => results.push(fastStringArrayJoin(suffix, ''));
walk(handleSuffix);
return results;
};
const dumpMeta = () => {
const results: Meta[] = [];
walk((suffix, meta) => {
results.push(meta);
});
return results;
};
const dumpWithMeta = () => {
const results: Array<[string, Meta]> = [];
walk((suffix, meta) => {
results.push([fastStringArrayJoin(suffix, ''), meta]);
});
return results;
};
const whitelist = (suffix: string) => {
if (!smolTree) {
throw new Error('whitelist method is only available in smolTree mode.');
}
public whitelist(suffix: string) {
const tokens = hostnameToTokens(suffix);
const res = getSingleChildLeaf(tokens);
const res = this.getSingleChildLeaf(tokens);
if (res === null) return;
@ -433,45 +434,48 @@ export const createTrie = <Meta = any>(from?: string[] | Set<string> | null, smo
node[0] = false;
}
};
}
// Actually build trie
if (Array.isArray(from)) {
for (let i = 0, l = from.length; i < l; i++) {
add(from[i]);
}
} else if (from) {
from.forEach((value) => add(value));
}
const inspect = (depth: number, unpackMeta?: (meta?: Meta) => any) => fastStringArrayJoin(
JSON.stringify(deepTrieNodeToJSON(root, unpackMeta), null, 2).split('\n').map((line) => ' '.repeat(depth) + line),
'\n'
);
return {
add,
contains,
find,
remove,
delete: remove,
has,
dump,
dumpMeta,
dumpWithMeta,
export class HostnameTrie<Meta = any> extends Triebase<Meta> {
get size() {
if (smolTree) {
throw new Error('A Trie with smolTree enabled cannot have correct size!');
return this.$size;
}
return size;
},
get root() {
return root;
},
whitelist,
inspect,
[util.inspect.custom]: inspect,
smolTree
/**
 * Inserts `suffix`, creating any missing nodes along its path.
 *
 * The size counter grows only when the final node was not already an
 * entry; in that case `meta` is stored on the node as well.
 */
add(suffix: string, meta?: Meta): void {
  let current: TrieNode<Meta> = this.$root;

  const descendOrCreate = (token: string) => {
    if (!current[2].has(token)) {
      current[2].set(token, createNode(current));
    }
    current = current[2].get(token)!;

    return false;
  };

  // A truthy result from walkHostnameTokens means the rest must be skipped.
  if (walkHostnameTokens(suffix, descendOrCreate)) {
    return;
  }

  if (current[0]) {
    return;
  }

  this.$size++;
  current[0] = true;
  current[3] = meta!;
}
}
/**
 * Factory kept for callers that still use the function-style API.
 *
 * `smolTree` defaults to `true`, so omitting the flag (or passing
 * `undefined`) yields a `HostnameSmolTrie`; only an explicit `false`
 * yields a `HostnameTrie`. The overloads mirror that runtime default so
 * the static type always matches the instance actually returned.
 *
 * The `false` overload is deliberately declared last so that
 * `ReturnType<typeof createTrie>` keeps resolving to `HostnameTrie`.
 *
 * @param from Optional array or Set of suffixes used to seed the trie.
 * @param smolTree Selects the implementation; defaults to `true`.
 * @returns A `HostnameSmolTrie` unless `smolTree` is `false`.
 */
export function createTrie<Meta = any>(from?: string[] | Set<string> | null, smolTree?: true): HostnameSmolTrie<Meta>;
export function createTrie<Meta = any>(from: string[] | Set<string> | null | undefined, smolTree: false): HostnameTrie<Meta>;
export function createTrie<Meta = any>(from?: string[] | Set<string> | null, smolTree = true) {
  if (smolTree) {
    return new HostnameSmolTrie<Meta>(from);
  }
  return new HostnameTrie<Meta>(from);
}
export type Trie = ReturnType<typeof createTrie>;

View File

@ -7,7 +7,7 @@ import { parseFelixDnsmasq } from './lib/parse-dnsmasq';
import { SOURCE_DIR } from './constants/dir';
export const parseDomesticList = async () => {
const trie = createTrie(await parseFelixDnsmasq('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf'), true);
const trie = createTrie(await parseFelixDnsmasq('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf'));
const top5000 = new Set<string>();

View File

@ -76,7 +76,7 @@ export const parseGfwList = async () => {
})).text();
const topDomains = parse(res);
const trie = createTrie(blackSet, true);
const trie = createTrie(blackSet);
for await (const [domain] of topDomains) {
if (trie.has(domain)) {