diff --git a/Build/build-reject-domainset.js b/Build/build-reject-domainset.js index 7e588359..335ccefe 100644 --- a/Build/build-reject-domainset.js +++ b/Build/build-reject-domainset.js @@ -4,7 +4,7 @@ const fse = require('fs-extra'); const readline = require('readline'); const { resolve: pathResolve } = require('path'); -const { processHosts, processFilterRules, preprocessFullDomainSetBeforeUsedAsWorkerData } = require('./lib/parse-filter'); +const { processHosts, processFilterRules } = require('./lib/parse-filter'); const { getDomain } = require('tldts'); const Trie = require('./lib/trie'); @@ -12,6 +12,8 @@ const { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, PREDEFINED_ENFORCED_BACKLI const { withBannerArray } = require('./lib/with-banner'); const { compareAndWriteFile } = require('./lib/string-array-compare'); const { processLine } = require('./lib/process-line'); +const { domainDeduper } = require('./lib/domain-deduper'); +const createKeywordFilter = require('./lib/aho-corasick'); /** Whitelists */ const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST); @@ -151,6 +153,8 @@ const domainSuffixSet = new Set(); console.log(`Start deduping from black keywords/suffixes! 
(${previousSize})`); console.time('* Dedupe from black keywords/suffixes'); + const kwfilter = createKeywordFilter(Array.from(domainKeywordsSet)); + const trie1 = Trie.from(Array.from(domainSets)); domainSuffixSet.forEach(suffix => { trie1.find(suffix, true).forEach(f => domainSets.delete(f)); @@ -173,7 +177,7 @@ const domainSuffixSet = new Set(); } // Remove keyword - if (isMatchKeyword(domain)) { + if (kwfilter.search(domain)) { domainSets.delete(domain); } } @@ -187,28 +191,10 @@ const domainSuffixSet = new Set(); const START_TIME = Date.now(); - const domainSetsArray = Array.from(domainSets); - const trie2 = Trie.from(domainSetsArray); - const fullsetDomainStartsWithADot = preprocessFullDomainSetBeforeUsedAsWorkerData(domainSetsArray); - console.log(fullsetDomainStartsWithADot.length); - - for (let j = 0, len = fullsetDomainStartsWithADot.length; j < len; j++) { - const domainStartsWithADotAndFromFullSet = fullsetDomainStartsWithADot[j]; - const found = trie2.find(domainStartsWithADotAndFromFullSet, false); - if (found.length) { - found.forEach(f => { - domainSets.delete(f); - }); - } - - const a = domainStartsWithADotAndFromFullSet.slice(1); - if (trie2.has(a)) { - domainSets.delete(a); - } - } + const dudupedDominArray = domainDeduper(Array.from(domainSets)); console.log(`* Dedupe from covered subdomain - ${(Date.now() - START_TIME) / 1000}s`); - console.log(`Deduped ${previousSize - domainSets.size} rules!`); + console.log(`Deduped ${previousSize - dudupedDominArray.length} rules!`); console.time('* Write reject.conf'); @@ -221,7 +207,7 @@ const domainSuffixSet = new Set(); } return 0; }; - const sortedDomainSets = Array.from(domainSets) + const sortedDomainSets = dudupedDominArray .map((v) => { return { v, domain: getDomain(v.charCodeAt(0) === 46 ? 
/**
 * A single trie node of the keyword automaton.
 *
 * @typedef {Object} Node
 * @prop {number} depth number of characters on the path from the root (root is 0)
 * @prop {string} key the character (UTF-16 code unit) that leads to this node
 * @prop {boolean} word true when some keyword ends at this node, or at any
 * node reachable from it through failure links
 * @prop {Map<string, Node>} children outgoing edges, keyed by character
 * @prop {Node | undefined} fail failure link; only the root keeps `undefined`
 */

/**
 * Create an empty trie node.
 *
 * @param {string} key
 * @param {number} depth
 * @returns {Node}
 */
const createNode = (key, depth = 0) => ({
  depth,
  key,
  word: false,
  children: new Map(),
  fail: undefined
});

/**
 * Build an Aho-Corasick style matcher over `keys`.
 *
 * The returned `search(text)` answers the same question as
 * `keys.some(key => key !== '' && text.includes(key))`, scanning `text` once.
 * Empty keys are ignored (they never produce a match).
 *
 * @param {string[]} keys
 * @returns {{ search: (text: string) => boolean }}
 */
const createKeywordFilter = (keys) => {
  const root = createNode('root');

  /**
   * Insert one keyword into the trie and flag its last node as a match.
   *
   * @param {string} key
   */
  const put = (key) => {
    let node = root;

    for (let idx = 0, len = key.length; idx < len; idx++) {
      const char = key[idx];
      let next = node.children.get(char);

      if (!next) {
        next = createNode(char, idx + 1);
        node.children.set(char, next);
      }

      node = next;
    }

    // An empty key leaves us on the root, which must never count as a match.
    if (node !== root) {
      node.word = true;
    }
  };

  /**
   * Compute failure links breadth-first. Must run after every key is inserted.
   */
  const build = () => {
    /** @type {Node[]} */
    const queue = [root];

    for (let head = 0; head < queue.length; head++) {
      const parent = queue[head];

      for (const [char, node] of parent.children) {
        let failNode = parent.fail;

        while (failNode && !failNode.children.has(char)) {
          failNode = failNode.fail;
        }

        node.fail = failNode?.children.get(char) ?? root;

        // The failure target is the longest proper suffix of this node's path
        // that is also in the trie. If a keyword ends there, it also ends
        // here. The target is shallower, so the breadth-first order has
        // already finalized its flag.
        if (node.fail.word) {
          node.word = true;
        }

        queue.push(node);
      }
    }
  };

  for (let idx = 0; idx < keys.length; idx++) {
    put(keys[idx]);
  }

  // Failure links depend on the complete trie, so build them exactly once.
  build();

  /**
   * @param {string} text
   * @returns {boolean} true as soon as any keyword is found inside `text`
   */
  const search = (text) => {
    let node = root;

    for (let i = 0, textLen = text.length; i < textLen; i++) {
      const char = text[i];

      // Every non-root node has a failure link after build(), so this walk
      // always ends at the root at the latest.
      while (node !== root && !node.children.has(char)) {
        node = node.fail;
      }

      node = node.children.get(char) ?? root;

      if (node.word) {
        return true;
      }
    }

    return false;
  };

  return {
    search
  };
};