From 23c9a963aa8376d57296913001cfdee9c3e8dd09 Mon Sep 17 00:00:00 2001 From: SukkaW Date: Wed, 13 Sep 2023 17:28:34 +0800 Subject: [PATCH] Perf: speed up infra --- Build/build-chn-cidr.js | 10 +- Build/build-domestic-ruleset.js | 3 +- Build/build-internal-cdn-rules.js | 11 +- Build/build-internal-reverse-chn-cidr.js | 8 +- Build/build-mitm-hostname.js | 2 +- Build/build-phishing-domainset.js | 4 +- Build/build-reject-domainset.js | 21 +- Build/build.js | 12 +- Build/download-previous-build.js | 1 + Build/lib/create-file.js | 34 +- Build/lib/parse-filter.js | 591 ++++++++++++----------- Build/lib/stable-sort-domain.test.js | 11 + Build/lib/trie.js | 2 +- Build/lib/trie.test.js | 1 + 14 files changed, 390 insertions(+), 321 deletions(-) create mode 100644 Build/lib/stable-sort-domain.test.js diff --git a/Build/build-chn-cidr.js b/Build/build-chn-cidr.js index 5a5864ea..8e9f2d90 100644 --- a/Build/build-chn-cidr.js +++ b/Build/build-chn-cidr.js @@ -15,17 +15,17 @@ const EXCLUDE_CIDRS = [ runner(__filename, async () => { const { exclude: excludeCidrs } = await import('cidr-tools-wasm'); - /** @type {Set} */ - const cidr = new Set(); + /** @type {string[]} */ + const cidr = []; for await (const line of await fetchRemoteTextAndCreateReadlineInterface('https://raw.githubusercontent.com/misakaio/chnroutes2/master/chnroutes.txt')) { const l = processLine(line); if (l) { - cidr.add(l); + cidr.push(l); } } - console.log('Before Merge:', cidr.size); - const filteredCidr = excludeCidrs(Array.from(cidr), EXCLUDE_CIDRS, true); + console.log('Before Merge:', cidr.length); + const filteredCidr = excludeCidrs(cidr, EXCLUDE_CIDRS, true); console.log('After Merge:', filteredCidr.length); const description = [ diff --git a/Build/build-domestic-ruleset.js b/Build/build-domestic-ruleset.js index 57c99d0c..d6dec47b 100644 --- a/Build/build-domestic-ruleset.js +++ b/Build/build-domestic-ruleset.js @@ -55,8 +55,7 @@ runner(__filename, async () => { `${domain} = server:${dns}`, `*.${domain} = server:${dns}` ]) - ), - '' + ) ], path.resolve(__dirname, '../Modules/sukka_local_dns_mapping.sgmodule') ) diff --git a/Build/build-internal-cdn-rules.js b/Build/build-internal-cdn-rules.js index 4a00bd4d..842f85b0 100644 --- a/Build/build-internal-cdn-rules.js +++ b/Build/build-internal-cdn-rules.js @@ -1,5 +1,4 @@ // @ts-check -const fs = require('fs'); const fse = require('fs-extra'); const path = require('path'); const { isDomainLoose } = require('./lib/is-domain-loose'); @@ -8,6 +7,7 @@ const { processLine } = require('./lib/process-line'); const { readFileByLine } = require('./lib/fetch-remote-text-by-line'); const domainSorter = require('./lib/stable-sort-domain'); const { runner } = require('./lib/trace-runner'); +const { compareAndWriteFile } = require('./lib/create-file'); /** * @param {string} string @@ -77,12 +77,11 @@ runner(__filename, async () => { fse.ensureDir(path.resolve(__dirname, '../List/internal')) ]); - await fs.promises.writeFile( - path.resolve(__dirname, '../List/internal/cdn.txt'), + await compareAndWriteFile( [ ...Array.from(set).sort(domainSorter).map(i => `SUFFIX,${i}`), - ...Array.from(keywords).sort().map(i => `REGEX,${i}`), - '' - ].join('\n') + ...Array.from(keywords).sort().map(i => `REGEX,${i}`) + ], + path.resolve(__dirname, '../List/internal/cdn.txt') ); }); diff --git a/Build/build-internal-reverse-chn-cidr.js b/Build/build-internal-reverse-chn-cidr.js index 057be3ec..965a9d26 100644 --- a/Build/build-internal-reverse-chn-cidr.js +++ b/Build/build-internal-reverse-chn-cidr.js @@ -27,18 +27,18 @@ const RESERVED_IPV4_CIDR = [ runner(__filename, async () => { const { exclude } = await import('cidr-tools-wasm'); - /** @type {Set} */ - const cidr = new Set(); + /** @type {string[]} */ + const cidr = []; for await (const line of await fetchRemoteTextAndCreateReadlineInterface('https://raw.githubusercontent.com/misakaio/chnroutes2/master/chnroutes.txt')) { const l = processLine(line); if (l) { - cidr.add(l); + cidr.push(l); } } const reversedCidr = exclude( ['0.0.0.0/0'], - RESERVED_IPV4_CIDR.concat(Array.from(cidr)), + RESERVED_IPV4_CIDR.concat(cidr), true ); diff --git a/Build/build-mitm-hostname.js b/Build/build-mitm-hostname.js index 5e2bd2d1..a8effbeb 100644 --- a/Build/build-mitm-hostname.js +++ b/Build/build-mitm-hostname.js @@ -95,7 +95,7 @@ const PRESET_MITM_HOSTNAMES = [ })); let mitmDomains = new Set(PRESET_MITM_HOSTNAMES); // Special case for parsed failed - const parsedFailures = new Set(); + const parsedFailures = []; const dedupedUrlRegexPaths = [...new Set(urlRegexPaths)]; diff --git a/Build/build-phishing-domainset.js b/Build/build-phishing-domainset.js index a588efcf..6e7524d2 100644 --- a/Build/build-phishing-domainset.js +++ b/Build/build-phishing-domainset.js @@ -62,9 +62,7 @@ const BLACK_TLD = new Set([ runner(__filename, async () => { const domainSet = Array.from( - ( - await processFilterRules('https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt') - ).black + (await processFilterRules('https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt')).black ); const domainCountMap = {}; diff --git a/Build/build-reject-domainset.js b/Build/build-reject-domainset.js index 6134a9b5..b7a18326 100644 --- a/Build/build-reject-domainset.js +++ b/Build/build-reject-domainset.js @@ -50,13 +50,9 @@ const domainSuffixSet = new Set(); const { white, black, foundDebugDomain } = i; if (foundDebugDomain) { shouldStop = true; + // we should not break here, as we want to see full matches from all data source } - white.forEach(i => { - // if (PREDEFINED_ENFORCED_BACKLIST.some(j => i.endsWith(j))) { - // return; - // } - filterRuleWhitelistDomainSets.add(i); - }); + white.forEach(i => filterRuleWhitelistDomainSets.add(i)); black.forEach(i => domainSets.add(i)); } else { process.exitCode = 1; @@ -71,15 +67,9 @@ const domainSuffixSet = new Set(); if (i) { const { white, black } = i; white.forEach(i => { - // if (PREDEFINED_ENFORCED_BACKLIST.some(j => i.endsWith(j))) { - // return; - // } filterRuleWhitelistDomainSets.add(i); }); black.forEach(i => { - // if (PREDEFINED_ENFORCED_BACKLIST.some(j => i.endsWith(j))) { - // return; - // } filterRuleWhitelistDomainSets.add(i); }); } else { @@ -89,7 +79,8 @@ const domainSuffixSet = new Set(); }))) ]); - const trie0 = Trie.from(Array.from(filterRuleWhitelistDomainSets)); + // remove pre-defined enforced blacklist from whitelist + const trie0 = Trie.from(filterRuleWhitelistDomainSets); PREDEFINED_ENFORCED_BACKLIST.forEach(enforcedBlack => { trie0.find(enforcedBlack).forEach(found => filterRuleWhitelistDomainSets.delete(found)); }); @@ -140,7 +131,7 @@ const domainSuffixSet = new Set(); const kwfilter = createKeywordFilter(Array.from(domainKeywordsSet)); - const trie1 = Trie.from(Array.from(domainSets)); + const trie1 = Trie.from(domainSets); domainSuffixSet.forEach(suffix => { trie1.find(suffix, true).forEach(f => domainSets.delete(f)); }); @@ -149,7 +140,7 @@ const domainSuffixSet = new Set(); }); // Build whitelist trie, to handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`) - const trieWhite = Trie.from(Array.from(filterRuleWhitelistDomainSets)); + const trieWhite = Trie.from(filterRuleWhitelistDomainSets); for (const domain of domainSets) { if (domain[0] === '.') { if (trieWhite.contains(domain)) { diff --git a/Build/build.js b/Build/build.js index 2b0056cc..9c076865 100644 --- a/Build/build.js +++ b/Build/build.js @@ -49,8 +49,8 @@ runner(__filename, async () => { * @param {string} sourcePath */ const processFile = async (sourcePath) => { - /** @type {Set} */ - const lines = new Set(); + /** @type {string[]} */ + const lines = []; let title = ''; /** @type {string[]} */ @@ -73,7 +73,7 @@ const processFile = async (sourcePath) => { const l = processLine(line); if (l) { - lines.add(l); + lines.push(l); } } @@ -89,7 +89,7 @@ async function transformDomainset(sourcePath, relativePath) { if (!res) return; const [title, descriptions, lines] = res; - const deduped = domainDeduper(Array.from(lines)); + const deduped = domainDeduper(lines); const description = [ 'License: AGPL 3.0', 'Homepage: https://ruleset.skk.moe', @@ -121,7 +121,7 @@ async function transformDomainset(sourcePath, relativePath) { async function transformRuleset(sourcePath, relativePath) { const res = await processFile(sourcePath); if (!res) return; - const [title, descriptions, set] = res; + const [title, descriptions, lines] = res; const description = [ 'License: AGPL 3.0', @@ -138,7 +138,7 @@ async function transformRuleset(sourcePath, relativePath) { title, description, new Date(), - Array.from(set), + lines, 'ruleset', path.resolve(outputSurgeDir, relativePath), path.resolve(outputClashDir, `${relativePath.slice(0, -path.extname(relativePath).length)}.txt`) diff --git a/Build/download-previous-build.js b/Build/download-previous-build.js index 5f431d6c..5583b6c6 100644 --- a/Build/download-previous-build.js +++ b/Build/download-previous-build.js @@ -36,6 +36,7 @@ runner(__filename, async () => { if (!allFileExists) { console.log(`File not exists: ${line}`); + break; } } } diff --git a/Build/lib/create-file.js b/Build/lib/create-file.js index 8fbd0180..0ee7a11a 100644 --- a/Build/lib/create-file.js +++ b/Build/lib/create-file.js @@ -1,5 +1,5 @@ // @ts-check -const { promises: fsPromises } = require('fs'); +const fs = require('fs'); const fse = require('fs-extra'); const { readFileByLine } = require('./fetch-remote-text-by-line'); const { surgeDomainsetToClashDomainset, surgeRulesetToClashClassicalTextRuleset } = require('./clash'); @@ -28,18 +28,35 @@ async function compareAndWriteFile(linesA, filePath) { } } - if (!isEqual || index !== linesA.length - 1) { - await fsPromises.writeFile( - filePath, - linesA.join('\n'), - { encoding: 'utf-8' } - ); + if (!isEqual || index !== linesA.length) { + const stream = fs.createWriteStream(filePath, { encoding: 'utf-8' }); + + for (let i = 0, len = linesA.length; i < len; i++) { + // eslint-disable-next-line no-await-in-loop -- backpressure + await writeToStream(stream, linesA[i]); + // eslint-disable-next-line no-await-in-loop -- backpressure + await writeToStream(stream, '\n'); + } + stream.end(); } else { console.log(`Same Content, bail out writing: ${filePath}`); } } module.exports.compareAndWriteFile = compareAndWriteFile; +/** + * @param {import('fs').WriteStream} stream + * @param {string} data + */ +async function writeToStream(stream, data) { + if (!stream.write(data)) { + return /** @type {Promise} */(new Promise((resolve) => { + stream.once('drain', () => { resolve(); }); + })); + } + return Promise.resolve(); +} + /** * @param {string} title * @param {string[]} description @@ -56,8 +73,7 @@ const withBannerArray = (title, description, date, content) => { ...description.map(line => (line ? `# ${line}` : '#')), '########################################', ...content, - '################# END ###################', - '' + '################# END ###################' ]; }; module.exports.withBannerArray = withBannerArray; diff --git a/Build/lib/parse-filter.js b/Build/lib/parse-filter.js index e0a1f73a..19d0e544 100644 --- a/Build/lib/parse-filter.js +++ b/Build/lib/parse-filter.js @@ -4,6 +4,7 @@ const { fetchRemoteTextAndCreateReadlineInterface } = require('./fetch-remote-te const { NetworkFilter } = require('@cliqz/adblocker'); const { normalizeDomain } = require('./is-domain-loose'); const { processLine } = require('./process-line'); +const { performance } = require('perf_hooks'); const DEBUG_DOMAIN_TO_FIND = null; // example.com | null let foundDebugDomain = false; @@ -98,17 +99,17 @@ const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder) /** * @param {string | URL} filterRulesUrl * @param {readonly (string | URL)[] | undefined} [fallbackUrls] - * @returns {Promise<{ white: Set, black: Set, foundDebugDomain: boolean, parseFailed: boolean }>} + * @returns {Promise<{ white: Set, black: Set, foundDebugDomain: boolean }>} */ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdParties = false) { - console.time(` - processFilterRules: ${filterRulesUrl}`); + const runStart = performance.now(); /** @type Set */ const whitelistDomainSets = new Set(); /** @type Set */ const blacklistDomainSets = new Set(); - const addToBlackList = (domainToBeAddedToBlack, isSubDomain) => { + const __addToBlackList = (domainToBeAddedToBlack, isSubDomain) => { if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) { warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND); foundDebugDomain = true; @@ -120,289 +121,341 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart blacklistDomainSets.add(domainToBeAddedToBlack); } }; - const addToWhiteList = (domainToBeAddedToWhite) => { - if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToWhite.includes(DEBUG_DOMAIN_TO_FIND)) { - warnOnce(filterRulesUrl.toString(), true, DEBUG_DOMAIN_TO_FIND); - foundDebugDomain = true; - } + const addToBlackList = DEBUG_DOMAIN_TO_FIND == null + ? __addToBlackList + : (domainToBeAddedToBlack, isSubDomain) => { + if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) { + warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND); + foundDebugDomain = true; + } + __addToBlackList(domainToBeAddedToBlack, isSubDomain); + }; + + const __addToWhiteList = (domainToBeAddedToWhite) => { whitelistDomainSets.add(domainToBeAddedToWhite); }; + const addToWhiteList = DEBUG_DOMAIN_TO_FIND == null + ? __addToWhiteList + : (domainToBeAddedToWhite) => { + if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToWhite.includes(DEBUG_DOMAIN_TO_FIND)) { + warnOnce(filterRulesUrl.toString(), true, DEBUG_DOMAIN_TO_FIND); + foundDebugDomain = true; + } + __addToWhiteList(domainToBeAddedToWhite); + }; - let filterRules; - try { - const controller = new AbortController(); - const signal = controller.signal; + let downloadTime = 0; - /** @type string[] */ - filterRules = ( - await Promise.any( - [filterRulesUrl, ...(fallbackUrls || [])].map( - url => fetchWithRetry(url, { signal }) - .then(r => r.text()) - .then(text => { - controller.abort(); - return text; - }) + const lineCb = (line) => { + const result = parse(line, includeThirdParties); + if (result) { + const flag = result[1]; + const hostname = result[0]; + switch (flag) { + case 0: + addToWhiteList(hostname); + break; + case 1: + addToBlackList(hostname, false); + break; + case 2: + addToBlackList(hostname, true); + break; + default: + throw new Error(`Unknown flag: ${flag}`); + } + } + }; + + if (!fallbackUrls || fallbackUrls.length === 0) { + const downloadStart = performance.now(); + for await (const line of await fetchRemoteTextAndCreateReadlineInterface(filterRulesUrl)) { + lineCb(line.trim()); + } + downloadTime = performance.now() - downloadStart; + } else { + let filterRules; + + const downloadStart = performance.now(); + try { + const controller = new AbortController(); + const signal = controller.signal; + + /** @type string[] */ + filterRules = ( + await Promise.any( + [filterRulesUrl, ...(fallbackUrls || [])].map( + url => fetchWithRetry(url, { signal }) + .then(r => r.text()) + .then(text => { + controller.abort(); + return text; + }) + ) ) - ) - ).split('\n').map(line => line.trim()); - } catch (e) { - console.log(`Download Rule for [${filterRulesUrl}] failed`); - throw e; - } - - let hasParseFailed = false; - - for (let i = 0, len = filterRules.length; i < len; i++) { - const line = filterRules[i].trim(); - - if ( - line === '' - || line[0] === '/' - || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test(line) - // doesn't include - || !line.includes('.') // rule with out dot can not be a domain - // includes - // || line.includes('#') - || line.includes('!') - || line.includes('?') - || line.includes('*') - // || line.includes('=') - || line.includes('[') - || line.includes('(') - || line.includes(']') - || line.includes(')') - || line.includes(',') - // || line.includes('~') - // || line.includes('&') - // || line.includes('%') - // ends with - || line.endsWith('.') - || line.endsWith('-') - || line.endsWith('_') - // special modifier - || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line) - || ((line.includes('/') || line.includes(':')) && !line.includes('://')) - // || line.includes('$popup') - // || line.includes('$removeparam') - // || line.includes('$popunder') - ) { - continue; + ).split('\n').map(line => line.trim()); + } catch (e) { + console.log(`Download Rule for [${filterRulesUrl}] failed`); + throw e; } + downloadTime = performance.now() - downloadStart; - const filter = NetworkFilter.parse(line); - if (filter) { - if ( - filter.isElemHide() - || filter.isGenericHide() - || filter.isSpecificHide() - || filter.isRedirect() - || filter.isRedirectRule() - || filter.hasDomains() - || filter.isCSP() // must not be csp rule - || (!filter.fromAny() && !filter.fromDocument()) - ) { - // not supported type - continue; - } - - if ( - filter.hasHostname() // must have - && filter.isPlain() - && (!filter.isRegex()) - && (!filter.isFullRegex()) - ) { - const hostname = normalizeDomain(filter.getHostname()); - if (hostname) { - if (filter.isException() || filter.isBadFilter()) { - addToWhiteList(hostname); - continue; - } - if (filter.firstParty() === filter.thirdParty()) { - addToBlackList(hostname, true); - continue; - } - if (filter.thirdParty()) { - if (includeThirdParties) { - addToBlackList(hostname, true); - } - continue; - } - if (filter.firstParty()) { - continue; - } - } else { - continue; - } - } - } - - if (line.includes('$third-party') || line.includes('$frame')) { - continue; - } - - const lineEndsWithCaret = line.endsWith('^'); - const lineEndsWithCaretVerticalBar = line.endsWith('^|'); - - if (line[0] === '@' && line[1] === '@') { - if (line.endsWith('$cname')) { - continue; - } - - if ( - // (line.startsWith('@@|') || line.startsWith('@@.')) - ( - line[2] === '|' - || line[2] === '.' - ) - && ( - lineEndsWithCaret - || lineEndsWithCaretVerticalBar - || line.endsWith('$genericblock') - || line.endsWith('$document') - ) - ) { - const _domain = line - .replace('@@||', '') - .replace('@@|', '') - .replace('@@.', '') - .replace('^|', '') - .replace('^$genericblock', '') - .replace('$genericblock', '') - .replace('^$document', '') - .replace('$document', '') - .replaceAll('^', '') - .trim(); - - const domain = normalizeDomain(_domain); - if (domain) { - addToWhiteList(domain); - } else { - console.warn(' * [parse-filter E0001] (black) invalid domain:', _domain); - } - - continue; - } - } - - if ( - line.startsWith('||') - && ( - lineEndsWithCaret - || lineEndsWithCaretVerticalBar - || line.endsWith('$cname') - ) - ) { - const _domain = line - .replace('||', '') - .replace('^|', '') - .replace('$cname', '') - .replaceAll('^', '') - .trim(); - - const domain = normalizeDomain(_domain); - if (domain) { - addToBlackList(domain, true); - } else { - console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain); - } - continue; - } - - const lineStartsWithSingleDot = line.startsWith('.'); - if ( - lineStartsWithSingleDot - && ( - lineEndsWithCaret - || lineEndsWithCaretVerticalBar - ) - ) { - const _domain = line - .replace('^|', '') - .replaceAll('^', '') - .slice(1) - .trim(); - - const domain = normalizeDomain(_domain); - if (domain) { - addToBlackList(domain, true); - } else { - console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain); - } - continue; - } - if ( - ( - line.startsWith('://') - || line.startsWith('http://') - || line.startsWith('https://') - || line.startsWith('|http://') - || line.startsWith('|https://') - ) - && ( - lineEndsWithCaret - || lineEndsWithCaretVerticalBar - ) - ) { - const _domain = line - .replace('|https://', '') - .replace('https://', '') - .replace('|http://', '') - .replace('http://', '') - .replace('://', '') - .replace('^|', '') - .replaceAll('^', '') - .trim(); - - const domain = normalizeDomain(_domain); - if (domain) { - addToBlackList(domain, false); - } else { - console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain); - } - continue; - } - if (line[0] !== '|' && lineEndsWithCaret) { - const _domain = line.slice(0, -1); - const domain = normalizeDomain(_domain); - if (domain) { - addToBlackList(domain, false); - } else { - console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain); - } - continue; - } - const tryNormalizeDomain = normalizeDomain(lineStartsWithSingleDot ? line.slice(1) : line); - if ( - tryNormalizeDomain - && ( - lineStartsWithSingleDot - ? tryNormalizeDomain.length === line.length - 1 - : tryNormalizeDomain === line - ) - ) { - addToBlackList(line, true); - continue; - } - - if ( - !line.endsWith('.js') - ) { - hasParseFailed = true; - console.warn(' * [parse-filter E0010] can not parse:', line); + for (let i = 0, len = filterRules.length; i < len; i++) { + const line = filterRules[i].trim(); + lineCb(line); } } - console.timeEnd(` - processFilterRules: ${filterRulesUrl}`); + console.log(` ┬ processFilterRules (${filterRulesUrl}): ${(performance.now() - runStart).toFixed(3)}ms`); + console.log(` └── download time: ${downloadTime.toFixed(3)}ms`); return { white: whitelistDomainSets, black: blacklistDomainSets, - foundDebugDomain, - parseFailed: hasParseFailed + foundDebugDomain }; } +/** + * @param {string} $line + * @param {boolean} includeThirdParties + * @returns {null | [string, 0 | 1 | 2]} - 0 white, 1 black abosulte, 2 black include subdomain + */ +function parse($line, includeThirdParties) { + const line = $line.trim(); + + if ( + line === '' + || line[0] === '/' + || R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test(line) + // doesn't include + || !line.includes('.') // rule with out dot can not be a domain + // includes + // || line.includes('#') + || line.includes('!') + || line.includes('?') + || line.includes('*') + // || line.includes('=') + || line.includes('[') + || line.includes('(') + || line.includes(']') + || line.includes(')') + || line.includes(',') + // || line.includes('~') + // || line.includes('&') + // || line.includes('%') + // ends with + || line.endsWith('.') + || line.endsWith('-') + || line.endsWith('_') + // special modifier + || R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line) + || ((line.includes('/') || line.includes(':')) && !line.includes('://')) + // || line.includes('$popup') + // || line.includes('$removeparam') + // || line.includes('$popunder') + ) { + return null; + } + + const filter = NetworkFilter.parse(line); + if (filter) { + if ( + filter.isElemHide() + || filter.isGenericHide() + || filter.isSpecificHide() + || filter.isRedirect() + || filter.isRedirectRule() + || filter.hasDomains() + || filter.isCSP() // must not be csp rule + || (!filter.fromAny() && !filter.fromDocument()) + ) { + // not supported type + return null; + } + + if ( + filter.hasHostname() // must have + && filter.isPlain() + && (!filter.isRegex()) + && (!filter.isFullRegex()) + ) { + const hostname = normalizeDomain(filter.getHostname()); + if (hostname) { + if (filter.isException() || filter.isBadFilter()) { + return [hostname, 0]; + } + if (filter.firstParty() === filter.thirdParty()) { + return [hostname, 2]; + } + if (filter.thirdParty()) { + if (includeThirdParties) { + return [hostname, 2]; + } + return null; + } + if (filter.firstParty()) { + return null; + } + } else { + return null; + } + } + } + + if (line.includes('$third-party') || line.includes('$frame')) { + return null; + } + + const lineEndsWithCaret = line.endsWith('^'); + const lineEndsWithCaretVerticalBar = line.endsWith('^|'); + + if (line[0] === '@' && line[1] === '@') { + if (line.endsWith('$cname')) { + return null; + } + + if ( + // (line.startsWith('@@|') || line.startsWith('@@.')) + ( + line[2] === '|' + || line[2] === '.' + ) + && ( + lineEndsWithCaret + || lineEndsWithCaretVerticalBar + || line.endsWith('$genericblock') + || line.endsWith('$document') + ) + ) { + const _domain = line + .replace('@@||', '') + .replace('@@|', '') + .replace('@@.', '') + .replace('^|', '') + .replace('^$genericblock', '') + .replace('$genericblock', '') + .replace('^$document', '') + .replace('$document', '') + .replaceAll('^', '') + .trim(); + + const domain = normalizeDomain(_domain); + if (domain) { + return [domain, 0]; + } + console.warn(' * [parse-filter E0001] (black) invalid domain:', _domain); + + return null; + } + } + + if ( + line.startsWith('||') + && ( + lineEndsWithCaret + || lineEndsWithCaretVerticalBar + || line.endsWith('$cname') + ) + ) { + const _domain = line + .replace('||', '') + .replace('^|', '') + .replace('$cname', '') + .replaceAll('^', '') + .trim(); + + const domain = normalizeDomain(_domain); + if (domain) { + return [domain, 2]; + } + console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain); + + return null; + } + + const lineStartsWithSingleDot = line.startsWith('.'); + if ( + lineStartsWithSingleDot + && ( + lineEndsWithCaret + || lineEndsWithCaretVerticalBar + ) + ) { + const _domain = line + .replace('^|', '') + .replaceAll('^', '') + .slice(1) + .trim(); + + const domain = normalizeDomain(_domain); + if (domain) { + return [domain, 2]; + } + console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain); + + return null; + } + if ( + ( + line.startsWith('://') + || line.startsWith('http://') + || line.startsWith('https://') + || line.startsWith('|http://') + || line.startsWith('|https://') + ) + && ( + lineEndsWithCaret + || lineEndsWithCaretVerticalBar + ) + ) { + const _domain = line + .replace('|https://', '') + .replace('https://', '') + .replace('|http://', '') + .replace('http://', '') + .replace('://', '') + .replace('^|', '') + .replaceAll('^', '') + .trim(); + + const domain = normalizeDomain(_domain); + if (domain) { + return [domain, 1]; + } + console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain); + + return null; + } + if (line[0] !== '|' && lineEndsWithCaret) { + const _domain = line.slice(0, -1); + const domain = normalizeDomain(_domain); + if (domain) { + return [domain, 1]; + } + console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain); + + return null; + } + const tryNormalizeDomain = normalizeDomain(lineStartsWithSingleDot ? line.slice(1) : line); + if ( + tryNormalizeDomain + && ( + lineStartsWithSingleDot + ? tryNormalizeDomain.length === line.length - 1 + : tryNormalizeDomain === line + ) + ) { + return [line, 2]; + } + + if (!line.endsWith('.js')) { + console.warn(' * [parse-filter E0010] can not parse:', line); + } + + return null; +} + module.exports.processDomainLists = processDomainLists; module.exports.processHosts = processHosts; module.exports.processFilterRules = processFilterRules; diff --git a/Build/lib/stable-sort-domain.test.js b/Build/lib/stable-sort-domain.test.js new file mode 100644 index 00000000..700e2e10 --- /dev/null +++ b/Build/lib/stable-sort-domain.test.js @@ -0,0 +1,11 @@ +const domainSorter = require('./stable-sort-domain'); +const chai = require('chai'); +const { describe, it } = require('mocha'); + +chai.should(); + +describe('stable-sort-domain', () => { + it('.ks.cn, .tag.unclaimedproperty.ks.gov', () => { + domainSorter('.ks.cn', '.tag.unclaimedproperty.ks.gov').should.eql(-1); + }); +}); diff --git a/Build/lib/trie.js b/Build/lib/trie.js index 968cb5d6..c5da0579 100644 --- a/Build/lib/trie.js +++ b/Build/lib/trie.js @@ -278,7 +278,7 @@ class Trie { * Static .from function taking an arbitrary iterable & converting it into * a trie. * - * @param {string[]} iterable - Target iterable. + * @param {string[] | Set} iterable - Target iterable. * @return {Trie} */ static from = iterable => { diff --git a/Build/lib/trie.test.js b/Build/lib/trie.test.js index c57c362f..6cdf3d4b 100644 --- a/Build/lib/trie.test.js +++ b/Build/lib/trie.test.js @@ -2,6 +2,7 @@ require('chai').should(); const Trie = require('./trie'); const assert = require('assert'); +const { describe, it } = require('mocha'); describe('Trie', () => { it('should be possible to add items to a Trie.', () => {