mirror of
https://github.com/SukkaW/Surge.git
synced 2025-12-12 01:00:34 +08:00
Implement keyword deduper using Aho-Corasick
This commit is contained in:
parent
bddb164589
commit
dcf565fb6b
@ -4,7 +4,7 @@ const fse = require('fs-extra');
|
||||
const readline = require('readline');
|
||||
|
||||
const { resolve: pathResolve } = require('path');
|
||||
const { processHosts, processFilterRules, preprocessFullDomainSetBeforeUsedAsWorkerData } = require('./lib/parse-filter');
|
||||
const { processHosts, processFilterRules } = require('./lib/parse-filter');
|
||||
const { getDomain } = require('tldts');
|
||||
const Trie = require('./lib/trie');
|
||||
|
||||
@ -12,6 +12,8 @@ const { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, PREDEFINED_ENFORCED_BACKLI
|
||||
const { withBannerArray } = require('./lib/with-banner');
|
||||
const { compareAndWriteFile } = require('./lib/string-array-compare');
|
||||
const { processLine } = require('./lib/process-line');
|
||||
const { domainDeduper } = require('./lib/domain-deduper');
|
||||
const createKeywordFilter = require('./lib/aho-corasick');
|
||||
|
||||
/** Whitelists */
|
||||
const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST);
|
||||
@ -151,6 +153,8 @@ const domainSuffixSet = new Set();
|
||||
console.log(`Start deduping from black keywords/suffixes! (${previousSize})`);
|
||||
console.time('* Dedupe from black keywords/suffixes');
|
||||
|
||||
const kwfilter = createKeywordFilter(Array.from(domainKeywordsSet));
|
||||
|
||||
const trie1 = Trie.from(Array.from(domainSets));
|
||||
domainSuffixSet.forEach(suffix => {
|
||||
trie1.find(suffix, true).forEach(f => domainSets.delete(f));
|
||||
@ -173,7 +177,7 @@ const domainSuffixSet = new Set();
|
||||
}
|
||||
|
||||
// Remove keyword
|
||||
if (isMatchKeyword(domain)) {
|
||||
if (kwfilter.search(domain)) {
|
||||
domainSets.delete(domain);
|
||||
}
|
||||
}
|
||||
@ -187,28 +191,10 @@ const domainSuffixSet = new Set();
|
||||
|
||||
const START_TIME = Date.now();
|
||||
|
||||
const domainSetsArray = Array.from(domainSets);
|
||||
const trie2 = Trie.from(domainSetsArray);
|
||||
const fullsetDomainStartsWithADot = preprocessFullDomainSetBeforeUsedAsWorkerData(domainSetsArray);
|
||||
console.log(fullsetDomainStartsWithADot.length);
|
||||
|
||||
for (let j = 0, len = fullsetDomainStartsWithADot.length; j < len; j++) {
|
||||
const domainStartsWithADotAndFromFullSet = fullsetDomainStartsWithADot[j];
|
||||
const found = trie2.find(domainStartsWithADotAndFromFullSet, false);
|
||||
if (found.length) {
|
||||
found.forEach(f => {
|
||||
domainSets.delete(f);
|
||||
});
|
||||
}
|
||||
|
||||
const a = domainStartsWithADotAndFromFullSet.slice(1);
|
||||
if (trie2.has(a)) {
|
||||
domainSets.delete(a);
|
||||
}
|
||||
}
|
||||
const dudupedDominArray = domainDeduper(Array.from(domainSets));
|
||||
|
||||
console.log(`* Dedupe from covered subdomain - ${(Date.now() - START_TIME) / 1000}s`);
|
||||
console.log(`Deduped ${previousSize - domainSets.size} rules!`);
|
||||
console.log(`Deduped ${previousSize - dudupedDominArray.length} rules!`);
|
||||
|
||||
console.time('* Write reject.conf');
|
||||
|
||||
@ -221,7 +207,7 @@ const domainSuffixSet = new Set();
|
||||
}
|
||||
return 0;
|
||||
};
|
||||
const sortedDomainSets = Array.from(domainSets)
|
||||
const sortedDomainSets = dudupedDominArray
|
||||
.map((v) => {
|
||||
return { v, domain: getDomain(v.charCodeAt(0) === 46 ? v.slice(1) : v) || v };
|
||||
})
|
||||
@ -255,16 +241,3 @@ const domainSuffixSet = new Set();
|
||||
|
||||
console.timeEnd('Total Time - build-reject-domain-set');
|
||||
})();
|
||||
|
||||
/**
|
||||
* @param {string} domain
|
||||
*/
|
||||
function isMatchKeyword(domain) {
|
||||
for (const keyword of domainKeywordsSet) {
|
||||
if (domain.includes(keyword)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
139
Build/lib/aho-corasick.js
Normal file
139
Build/lib/aho-corasick.js
Normal file
@ -0,0 +1,139 @@
|
||||
/**
 * @typedef {Object} Node
 * @prop {number} depth - Distance from the root (0 for the root itself).
 * @prop {string} key - The character this node represents.
 * @prop {boolean} word - True when the root-to-node path spells a whole keyword.
 * @prop {Record<string, Node>} children
 * @prop {Node} [fail] - Aho-Corasick failure link: node for the longest proper
 *   suffix of this path that is also a path from the root.
 */

/**
 * Create a fresh trie node.
 *
 * @param {string} key
 * @param {number} [depth]
 * @returns {Node}
 */
const createNode = (key, depth = 0) => ({
  depth,
  key,
  word: false,
  children: {},
  fail: undefined
});

/**
 * Build an Aho-Corasick keyword matcher.
 *
 * @param {string[]} keys - The keywords to match against.
 * @returns {{ search: (text: string) => boolean }} `search(text)` returns true
 *   as soon as any keyword occurs anywhere inside `text`.
 */
const createKeywordFilter = (keys) => {
  const root = createNode('root');

  /**
   * Insert a single keyword into the trie.
   * @param {string} key
   */
  const put = (key) => {
    let node = root;
    const lastIdx = key.length - 1;
    for (let idx = 0; idx <= lastIdx; idx++) {
      const ch = key[idx];
      let nextNode = node.children[ch];
      if (!nextNode) {
        nextNode = createNode(ch, idx + 1);
        node.children[ch] = nextNode;
      }
      node = nextNode;
      // Mark the end of a keyword. The depth check excludes the root, so an
      // empty keyword never marks the root as a word.
      if (idx === lastIdx && node.depth) {
        node.word = true;
      }
    }
  };

  /**
   * Compute failure links with a BFS over the trie.
   *
   * Must run after ALL keywords are inserted. Running it exactly once here
   * (rather than after every insertion) keeps construction linear instead of
   * quadratic in the number of keywords.
   */
  const build = () => {
    /** @type {Node[]} */
    const queue = [root];
    let head = 0;
    while (head < queue.length) {
      const node = queue[head++];
      const children = node.children;
      // eslint-disable-next-line guard-for-in -- plain object
      for (const key in children) {
        const child = children[key];
        let failNode = node.fail;
        // Walk failure links until one has a matching child (or we fall off the root).
        while (failNode && !failNode.children[key]) {
          failNode = failNode.fail;
        }
        child.fail = failNode?.children[key] || root;
        queue.push(child);
      }
    }
  };

  for (let i = 0, len = keys.length; i < len; i++) {
    put(keys[i]);
  }
  build();

  /**
   * Scan `text` for any of the keywords.
   *
   * @param {string} text
   * @returns {boolean} true on the first keyword hit, false if none occur.
   */
  const search = (text) => {
    let node = root;
    for (let i = 0, textLen = text.length; i < textLen; i++) {
      const key = text[i];
      // Fall back along failure links until a child matches or we reach the root.
      while (node && !node.children[key]) {
        node = node.fail;
      }
      node = node?.children[key] || root;
      if (node.word) {
        return true;
      }
    }
    return false;
  };

  return {
    search
  };
};
|
||||
|
||||
module.exports = createKeywordFilter;
|
||||
@ -240,7 +240,8 @@ const PREDEFINED_ENFORCED_WHITELIST = [
|
||||
'ipfs.fleek.cool',
|
||||
'repl.co',
|
||||
'w3s.link',
|
||||
'translate.goog'
|
||||
'translate.goog',
|
||||
'backblazeb2.com'
|
||||
];
|
||||
|
||||
module.exports.HOSTS = HOSTS;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user