Implement keyword deduper using AhoCorasick

This commit is contained in:
SukkaW 2023-07-22 21:39:39 +08:00
parent bddb164589
commit dcf565fb6b
3 changed files with 150 additions and 37 deletions

View File

@ -4,7 +4,7 @@ const fse = require('fs-extra');
const readline = require('readline'); const readline = require('readline');
const { resolve: pathResolve } = require('path'); const { resolve: pathResolve } = require('path');
const { processHosts, processFilterRules, preprocessFullDomainSetBeforeUsedAsWorkerData } = require('./lib/parse-filter'); const { processHosts, processFilterRules } = require('./lib/parse-filter');
const { getDomain } = require('tldts'); const { getDomain } = require('tldts');
const Trie = require('./lib/trie'); const Trie = require('./lib/trie');
@ -12,6 +12,8 @@ const { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, PREDEFINED_ENFORCED_BACKLI
const { withBannerArray } = require('./lib/with-banner'); const { withBannerArray } = require('./lib/with-banner');
const { compareAndWriteFile } = require('./lib/string-array-compare'); const { compareAndWriteFile } = require('./lib/string-array-compare');
const { processLine } = require('./lib/process-line'); const { processLine } = require('./lib/process-line');
const { domainDeduper } = require('./lib/domain-deduper');
const createKeywordFilter = require('./lib/aho-corasick');
/** Whitelists */ /** Whitelists */
const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST); const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST);
@ -151,6 +153,8 @@ const domainSuffixSet = new Set();
console.log(`Start deduping from black keywords/suffixes! (${previousSize})`); console.log(`Start deduping from black keywords/suffixes! (${previousSize})`);
console.time('* Dedupe from black keywords/suffixes'); console.time('* Dedupe from black keywords/suffixes');
const kwfilter = createKeywordFilter(Array.from(domainKeywordsSet));
const trie1 = Trie.from(Array.from(domainSets)); const trie1 = Trie.from(Array.from(domainSets));
domainSuffixSet.forEach(suffix => { domainSuffixSet.forEach(suffix => {
trie1.find(suffix, true).forEach(f => domainSets.delete(f)); trie1.find(suffix, true).forEach(f => domainSets.delete(f));
@ -173,7 +177,7 @@ const domainSuffixSet = new Set();
} }
// Remove keyword // Remove keyword
if (isMatchKeyword(domain)) { if (kwfilter.search(domain)) {
domainSets.delete(domain); domainSets.delete(domain);
} }
} }
@ -187,28 +191,10 @@ const domainSuffixSet = new Set();
const START_TIME = Date.now(); const START_TIME = Date.now();
const domainSetsArray = Array.from(domainSets); const dudupedDominArray = domainDeduper(Array.from(domainSets));
const trie2 = Trie.from(domainSetsArray);
const fullsetDomainStartsWithADot = preprocessFullDomainSetBeforeUsedAsWorkerData(domainSetsArray);
console.log(fullsetDomainStartsWithADot.length);
for (let j = 0, len = fullsetDomainStartsWithADot.length; j < len; j++) {
const domainStartsWithADotAndFromFullSet = fullsetDomainStartsWithADot[j];
const found = trie2.find(domainStartsWithADotAndFromFullSet, false);
if (found.length) {
found.forEach(f => {
domainSets.delete(f);
});
}
const a = domainStartsWithADotAndFromFullSet.slice(1);
if (trie2.has(a)) {
domainSets.delete(a);
}
}
console.log(`* Dedupe from covered subdomain - ${(Date.now() - START_TIME) / 1000}s`); console.log(`* Dedupe from covered subdomain - ${(Date.now() - START_TIME) / 1000}s`);
console.log(`Deduped ${previousSize - domainSets.size} rules!`); console.log(`Deduped ${previousSize - dudupedDominArray.length} rules!`);
console.time('* Write reject.conf'); console.time('* Write reject.conf');
@ -221,7 +207,7 @@ const domainSuffixSet = new Set();
} }
return 0; return 0;
}; };
const sortedDomainSets = Array.from(domainSets) const sortedDomainSets = dudupedDominArray
.map((v) => { .map((v) => {
return { v, domain: getDomain(v.charCodeAt(0) === 46 ? v.slice(1) : v) || v }; return { v, domain: getDomain(v.charCodeAt(0) === 46 ? v.slice(1) : v) || v };
}) })
@ -255,16 +241,3 @@ const domainSuffixSet = new Set();
console.timeEnd('Total Time - build-reject-domain-set'); console.timeEnd('Total Time - build-reject-domain-set');
})(); })();
/**
 * Check whether a domain contains at least one entry of the file-level
 * `domainKeywordsSet` as a substring.
 *
 * @param {string} domain
 * @returns {boolean} `true` as soon as one keyword is found inside `domain`
 */
function isMatchKeyword(domain) {
  return Array.from(domainKeywordsSet).some((keyword) => domain.includes(keyword));
}

139
Build/lib/aho-corasick.js Normal file
View File

@ -0,0 +1,139 @@
/**
* @typedef {Object} Node
* @prop {number} [depth = 0]
* @prop {string} key
* @prop {boolean} [word = false]
* @prop {Record<string, Node>} [children={}]
* @prop {Node} [fail]
* @prop {number} [count=0]
*/
/**
 * Create a fresh, unlinked trie node.
 *
 * @param {string} key the single character (or label) this node stands for
 * @param {number} depth distance from the root; defaults to 0
 * @returns {Node} a node with no children, no fail link and a zero counter
 */
const createNode = (key, depth = 0) => {
  /** @type {Node} */
  const node = {
    depth,
    key,
    word: false,
    children: {},
    fail: undefined,
    count: 0
  };
  return node;
};
/**
 * Build an Aho-Corasick automaton from `keys` and return a matcher that tells
 * whether a text contains at least one of the keys as a substring.
 *
 * @param {string[]} keys keywords to look for; empty strings never match
 * @returns {{ search: (text: string) => boolean }}
 */
const createKeywordFilter = (keys) => {
  const root = createNode('root');

  /**
   * Insert one keyword into the trie, creating nodes as needed and bumping
   * the pass-through counter of every node on its path.
   *
   * @param {string} key
   */
  const put = (key) => {
    let node = root;
    node.count++;

    for (let idx = 0, len = key.length; idx < len; idx++) {
      const val = key[idx];
      let nextNode = node.children[val];
      if (nextNode) {
        nextNode.count++;
      } else {
        nextNode = createNode(val, idx + 1);
        nextNode.count = 1;
        node.children[val] = nextNode;
      }
      node = nextNode;
    }

    // An empty keyword ends on the root; never mark the root as terminal,
    // otherwise every text would match.
    if (node !== root) {
      node.word = true;
    }
  };

  /**
   * Compute the fail links with a breadth-first walk over the trie.
   *
   * While doing so, the terminal flag is propagated along the fail links: a
   * node whose fail target ends a keyword also ends that keyword (as a
   * suffix of the path). Without this, a keyword embedded in a longer
   * keyword's path (e.g. "bc" inside "abcd") is missed by `search`.
   */
  const build = () => {
    /** @type {Node[]} */
    const queue = [root];

    for (let idx = 0; idx < queue.length; idx++) {
      const beginNode = queue[idx];

      for (const key of Object.keys(beginNode.children)) {
        const node = beginNode.children[key];

        let failNode = beginNode.fail;
        while (failNode && !failNode.children[key]) {
          failNode = failNode.fail;
        }
        node.fail = failNode?.children[key] || root;

        // The fail target is strictly shallower, so the BFS has already
        // finalized its flag by the time we get here.
        if (node.fail.word) {
          node.word = true;
        }

        queue.push(node);
      }
    }
  };

  for (let idx = 0; idx < keys.length; idx++) {
    put(keys[idx]);
  }
  // Fail links only need to be computed once, after every key is inserted.
  build();

  /**
   * @param {string} text
   * @returns {boolean} `true` if any keyword occurs anywhere in `text`
   */
  const search = (text) => {
    let node = root;

    for (let i = 0, textLen = text.length; i < textLen; i++) {
      const key = text[i];

      // Fall back along the fail links until a state accepts `key`
      // (or we run off the root, whose fail link is undefined).
      while (node && !node.children[key]) {
        node = node.fail;
      }
      node = node?.children[key] || root;

      if (node.word) {
        return true;
      }
    }

    return false;
  };

  return {
    search
  };
};
module.exports = createKeywordFilter;

View File

@ -240,7 +240,8 @@ const PREDEFINED_ENFORCED_WHITELIST = [
'ipfs.fleek.cool', 'ipfs.fleek.cool',
'repl.co', 'repl.co',
'w3s.link', 'w3s.link',
'translate.goog' 'translate.goog',
'backblazeb2.com'
]; ];
module.exports.HOSTS = HOSTS; module.exports.HOSTS = HOSTS;