Improve keyword filter performance

This commit is contained in:
SukkaW
2024-06-02 15:58:23 +08:00
parent 2b1a8d1394
commit 95de88c45e
2 changed files with 84 additions and 43 deletions

View File

@@ -4,9 +4,19 @@ import createKeywordFilter from './aho-corasick';
describe('AhoCorasick', () => {
  it('basic', () => {
    // Filter whose keywords occur inside some of the probe strings.
    let kwfilter = createKeywordFilter(['ap', 'an']);
    expect(kwfilter('bananan')).toBeTrue();
    expect(kwfilter('apple')).toBeTrue();
    expect(kwfilter('melon')).toBeFalse();

    // Logged for manual inspection only; nothing is asserted on the output.
    console.log(kwfilter);

    // Filter whose keywords occur in none of the probe strings.
    kwfilter = createKeywordFilter(['cdn', 'sukka']);
    expect(kwfilter('bananan')).toBeFalse();
    expect(kwfilter('apple')).toBeFalse();
    expect(kwfilter('melon')).toBeFalse();

    console.log(kwfilter);

    // Keywords that share prefixes and suffixes, again only for visual inspection.
    console.log(createKeywordFilter(['skk.moe', 'anotherskk', 'skk.com']));
  });
});

View File

@@ -1,89 +1,120 @@
/** Symbol key under which a trie node records whether a keyword ends at it. */
const WORDEND = Symbol('wordEnd');
/** Symbol key under which a trie node stores its failure link. */
const FAIL = Symbol('fail');

/**
 * A trie node. The Map itself holds the children; the bookkeeping fields live
 * under symbol keys so they are never mistaken for a child entry.
 */
type Node = Map<string, Node> & {
  [WORDEND]: boolean,
  [FAIL]: Node | undefined
};

/**
 * Creates an empty trie node: no children, not a word end, no failure link.
 *
 * @returns a fresh node that shares no state with any other node
 */
const createNode = (): Node => {
  // The assertion is required because a newly built Map does not carry the
  // symbol fields yet; both are assigned right below, before the node escapes.
  const node = new Map<string, Node>() as Node;
  node[WORDEND] = false;
  node[FAIL] = undefined;
  return node;
};
/**
 * Recursively converts a trie node into a plain object for debug output.
 *
 * Each child is stored under its map key; a node flagged as a word end also
 * gets an `[end]` entry. Failure links are not followed.
 *
 * @param node the node to convert
 * @param wset nodes already converted during this traversal; a node found in
 *             it is rendered as the string `'circular'` instead of being
 *             walked again
 * @returns the plain-object form of `node`, or `'circular'` for a revisit
 */
const deepNodeToJSON = (node: Node, wset: WeakSet<Node>) => {
  if (wset.has(node)) {
    return 'circular';
  }
  wset.add(node);

  const plain: Record<string, any> = {};
  if (node[WORDEND]) {
    plain['[end]'] = node[WORDEND];
  }

  for (const [char, child] of node) {
    plain[char] = deepNodeToJSON(child, wset);
  }

  return plain;
};
/**
 * Builds a zero-argument callback that renders the trie rooted at `node` as
 * pretty-printed JSON (2-space indent).
 *
 * @param node root of the trie to render
 * @returns a function producing the JSON text; if rendering throws, the error
 *          is written to `console.error` and an empty string is returned
 */
function createNodeInspectCustom(node: Node) {
  return () => {
    try {
      // The visited-set must be created per call. A set shared across calls
      // still contains the root after the first call, so every later call
      // would print "circular" instead of the trie.
      const visited = new WeakSet<Node>();
      return JSON.stringify(deepNodeToJSON(node, visited), null, 2);
    } catch (e) {
      console.error(e);
      return '';
    }
  };
}
/**
 * Builds a substring matcher for a fixed set of keywords, backed by an
 * Aho-Corasick automaton (a trie plus failure links).
 *
 * @param keys keywords to search for; an empty-string keyword is ignored
 * @returns a predicate that returns `true` as soon as `text` is found to
 *          contain at least one keyword, `false` otherwise
 */
const createKeywordFilter = (keys: string[] | Set<string>) => {
  const root = createNode();

  // Create a trie with extra fields and information
  const put = (key: string) => {
    let node = root;

    for (let idx = 0, len = key.length; idx < len; idx++) {
      const char = key[idx];
      let next = node.get(char);
      if (next === undefined) {
        next = createNode();
        node.set(char, next);
      }
      node = next;
    }

    // An empty key never leaves the root. Flagging the root as a word end
    // would make the matcher report a hit whenever it falls back to the root.
    if (node !== root) {
      node[WORDEND] = true;
    }
  };

  for (const key of keys) {
    put(key);
  }

  // Failure links have to be built breadth-first: computing the link of a node
  // walks the failure chain of shallower nodes, so those must be final already.
  // A depth-first walk reads links that are still undefined and silently falls
  // back to the root, which loses matches.
  //
  // Iterating the array with for...of also visits the elements pushed during
  // the loop, which gives FIFO order without an explicit head index.
  const queue: Node[] = [root];
  for (const parent of queue) {
    parent.forEach((child, char) => {
      let target: Node | undefined;
      for (let failNode = parent[FAIL]; failNode; failNode = failNode[FAIL]) {
        target = failNode.get(char);
        if (target) {
          break;
        }
      }

      const fail = target ?? root;
      child[FAIL] = fail;

      // If the longest proper suffix of this path ends a keyword, then reaching
      // this node means that keyword occurred too (e.g. "bc" inside "abc").
      // Without this, such suffix keywords are missed unless the path itself is
      // a keyword. Note the flag therefore also shows up as "[end]" in the
      // inspect output for these nodes.
      if (fail[WORDEND]) {
        child[WORDEND] = true;
      }

      queue.push(child);
    });
  }

  const tester = (text: string) => {
    let node = root;

    for (let i = 0, textLen = text.length; i < textLen; i++) {
      const char = text[i];

      // Follow failure links until some state accepts `char`; the root is the
      // last resort and simply stays put when it has no such child.
      let next = node.get(char);
      while (next === undefined && node !== root) {
        node = node[FAIL] ?? root;
        next = node.get(char);
      }
      node = next ?? root;

      if (node[WORDEND]) {
        return true;
      }
    }

    return false;
  };

  tester[Bun.inspect.custom] = createNodeInspectCustom(root);

  return tester;
};
export default createKeywordFilter; export default createKeywordFilter;