Perf: speed up infra

This commit is contained in:
SukkaW
2023-09-13 17:28:34 +08:00
parent c2022ce61d
commit 23c9a963aa
14 changed files with 390 additions and 321 deletions

View File

@@ -1,5 +1,5 @@
// @ts-check
const { promises: fsPromises } = require('fs');
const fs = require('fs');
const fse = require('fs-extra');
const { readFileByLine } = require('./fetch-remote-text-by-line');
const { surgeDomainsetToClashDomainset, surgeRulesetToClashClassicalTextRuleset } = require('./clash');
@@ -28,18 +28,35 @@ async function compareAndWriteFile(linesA, filePath) {
}
}
if (!isEqual || index !== linesA.length - 1) {
await fsPromises.writeFile(
filePath,
linesA.join('\n'),
{ encoding: 'utf-8' }
);
if (!isEqual || index !== linesA.length) {
const stream = fs.createWriteStream(filePath, { encoding: 'utf-8' });
for (let i = 0, len = linesA.length; i < len; i++) {
// eslint-disable-next-line no-await-in-loop -- backpressure
await writeToStream(stream, linesA[i]);
// eslint-disable-next-line no-await-in-loop -- backpressure
await writeToStream(stream, '\n');
}
stream.end();
} else {
console.log(`Same Content, bail out writing: ${filePath}`);
}
}
module.exports.compareAndWriteFile = compareAndWriteFile;
/**
 * Write one chunk to a stream while respecting backpressure.
 *
 * When `stream.write` returns a truthy value the chunk was accepted without
 * filling the buffer and the promise resolves right away. Otherwise the
 * promise resolves on the next `'drain'` event. If the stream emits `'error'`
 * while we are waiting, the promise rejects with that error instead of
 * staying pending forever (a failed stream never emits `'drain'`).
 *
 * @param {import('fs').WriteStream} stream
 * @param {string} data
 * @returns {Promise<void>}
 */
async function writeToStream(stream, data) {
  if (stream.write(data)) {
    return;
  }
  await /** @type {Promise<void>} */(new Promise((resolve, reject) => {
    // Whichever event fires first detaches the other listener, so repeated
    // calls on the same stream never accumulate stale handlers.
    const onDrain = () => {
      stream.removeListener('error', onError);
      resolve();
    };
    const onError = (err) => {
      stream.removeListener('drain', onDrain);
      reject(err);
    };
    stream.once('drain', onDrain);
    stream.once('error', onError);
  }));
}
/**
* @param {string} title
* @param {string[]} description
@@ -56,8 +73,7 @@ const withBannerArray = (title, description, date, content) => {
...description.map(line => (line ? `# ${line}` : '#')),
'########################################',
...content,
'################# END ###################',
''
'################# END ###################'
];
};
module.exports.withBannerArray = withBannerArray;

View File

@@ -4,6 +4,7 @@ const { fetchRemoteTextAndCreateReadlineInterface } = require('./fetch-remote-te
const { NetworkFilter } = require('@cliqz/adblocker');
const { normalizeDomain } = require('./is-domain-loose');
const { processLine } = require('./process-line');
const { performance } = require('perf_hooks');
const DEBUG_DOMAIN_TO_FIND = null; // example.com | null
let foundDebugDomain = false;
@@ -98,17 +99,17 @@ const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)
/**
* @param {string | URL} filterRulesUrl
* @param {readonly (string | URL)[] | undefined} [fallbackUrls]
* @returns {Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean, parseFailed: boolean }>}
* @returns {Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }>}
*/
async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdParties = false) {
console.time(` - processFilterRules: ${filterRulesUrl}`);
const runStart = performance.now();
/** @type Set<string> */
const whitelistDomainSets = new Set();
/** @type Set<string> */
const blacklistDomainSets = new Set();
const addToBlackList = (domainToBeAddedToBlack, isSubDomain) => {
const __addToBlackList = (domainToBeAddedToBlack, isSubDomain) => {
if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) {
warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
foundDebugDomain = true;
@@ -120,289 +121,341 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
blacklistDomainSets.add(domainToBeAddedToBlack);
}
};
const addToWhiteList = (domainToBeAddedToWhite) => {
if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToWhite.includes(DEBUG_DOMAIN_TO_FIND)) {
warnOnce(filterRulesUrl.toString(), true, DEBUG_DOMAIN_TO_FIND);
foundDebugDomain = true;
}
const addToBlackList = DEBUG_DOMAIN_TO_FIND == null
? __addToBlackList
: (domainToBeAddedToBlack, isSubDomain) => {
if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) {
warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
foundDebugDomain = true;
}
__addToBlackList(domainToBeAddedToBlack, isSubDomain);
};
const __addToWhiteList = (domainToBeAddedToWhite) => {
whitelistDomainSets.add(domainToBeAddedToWhite);
};
const addToWhiteList = DEBUG_DOMAIN_TO_FIND == null
? __addToWhiteList
: (domainToBeAddedToWhite) => {
if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToWhite.includes(DEBUG_DOMAIN_TO_FIND)) {
warnOnce(filterRulesUrl.toString(), true, DEBUG_DOMAIN_TO_FIND);
foundDebugDomain = true;
}
__addToWhiteList(domainToBeAddedToWhite);
};
let filterRules;
try {
const controller = new AbortController();
const signal = controller.signal;
let downloadTime = 0;
/** @type string[] */
filterRules = (
await Promise.any(
[filterRulesUrl, ...(fallbackUrls || [])].map(
url => fetchWithRetry(url, { signal })
.then(r => r.text())
.then(text => {
controller.abort();
return text;
})
const lineCb = (line) => {
const result = parse(line, includeThirdParties);
if (result) {
const flag = result[1];
const hostname = result[0];
switch (flag) {
case 0:
addToWhiteList(hostname);
break;
case 1:
addToBlackList(hostname, false);
break;
case 2:
addToBlackList(hostname, true);
break;
default:
throw new Error(`Unknown flag: ${flag}`);
}
}
};
if (!fallbackUrls || fallbackUrls.length === 0) {
const downloadStart = performance.now();
for await (const line of await fetchRemoteTextAndCreateReadlineInterface(filterRulesUrl)) {
lineCb(line.trim());
}
downloadTime = performance.now() - downloadStart;
} else {
let filterRules;
const downloadStart = performance.now();
try {
const controller = new AbortController();
const signal = controller.signal;
/** @type string[] */
filterRules = (
await Promise.any(
[filterRulesUrl, ...(fallbackUrls || [])].map(
url => fetchWithRetry(url, { signal })
.then(r => r.text())
.then(text => {
controller.abort();
return text;
})
)
)
)
).split('\n').map(line => line.trim());
} catch (e) {
console.log(`Download Rule for [${filterRulesUrl}] failed`);
throw e;
}
let hasParseFailed = false;
for (let i = 0, len = filterRules.length; i < len; i++) {
const line = filterRules[i].trim();
if (
line === ''
|| line[0] === '/'
|| R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test(line)
// doesn't include
|| !line.includes('.') // rule with out dot can not be a domain
// includes
// || line.includes('#')
|| line.includes('!')
|| line.includes('?')
|| line.includes('*')
// || line.includes('=')
|| line.includes('[')
|| line.includes('(')
|| line.includes(']')
|| line.includes(')')
|| line.includes(',')
// || line.includes('~')
// || line.includes('&')
// || line.includes('%')
// ends with
|| line.endsWith('.')
|| line.endsWith('-')
|| line.endsWith('_')
// special modifier
|| R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
|| ((line.includes('/') || line.includes(':')) && !line.includes('://'))
// || line.includes('$popup')
// || line.includes('$removeparam')
// || line.includes('$popunder')
) {
continue;
).split('\n').map(line => line.trim());
} catch (e) {
console.log(`Download Rule for [${filterRulesUrl}] failed`);
throw e;
}
downloadTime = performance.now() - downloadStart;
const filter = NetworkFilter.parse(line);
if (filter) {
if (
filter.isElemHide()
|| filter.isGenericHide()
|| filter.isSpecificHide()
|| filter.isRedirect()
|| filter.isRedirectRule()
|| filter.hasDomains()
|| filter.isCSP() // must not be csp rule
|| (!filter.fromAny() && !filter.fromDocument())
) {
// not supported type
continue;
}
if (
filter.hasHostname() // must have
&& filter.isPlain()
&& (!filter.isRegex())
&& (!filter.isFullRegex())
) {
const hostname = normalizeDomain(filter.getHostname());
if (hostname) {
if (filter.isException() || filter.isBadFilter()) {
addToWhiteList(hostname);
continue;
}
if (filter.firstParty() === filter.thirdParty()) {
addToBlackList(hostname, true);
continue;
}
if (filter.thirdParty()) {
if (includeThirdParties) {
addToBlackList(hostname, true);
}
continue;
}
if (filter.firstParty()) {
continue;
}
} else {
continue;
}
}
}
if (line.includes('$third-party') || line.includes('$frame')) {
continue;
}
const lineEndsWithCaret = line.endsWith('^');
const lineEndsWithCaretVerticalBar = line.endsWith('^|');
if (line[0] === '@' && line[1] === '@') {
if (line.endsWith('$cname')) {
continue;
}
if (
// (line.startsWith('@@|') || line.startsWith('@@.'))
(
line[2] === '|'
|| line[2] === '.'
)
&& (
lineEndsWithCaret
|| lineEndsWithCaretVerticalBar
|| line.endsWith('$genericblock')
|| line.endsWith('$document')
)
) {
const _domain = line
.replace('@@||', '')
.replace('@@|', '')
.replace('@@.', '')
.replace('^|', '')
.replace('^$genericblock', '')
.replace('$genericblock', '')
.replace('^$document', '')
.replace('$document', '')
.replaceAll('^', '')
.trim();
const domain = normalizeDomain(_domain);
if (domain) {
addToWhiteList(domain);
} else {
console.warn(' * [parse-filter E0001] (black) invalid domain:', _domain);
}
continue;
}
}
if (
line.startsWith('||')
&& (
lineEndsWithCaret
|| lineEndsWithCaretVerticalBar
|| line.endsWith('$cname')
)
) {
const _domain = line
.replace('||', '')
.replace('^|', '')
.replace('$cname', '')
.replaceAll('^', '')
.trim();
const domain = normalizeDomain(_domain);
if (domain) {
addToBlackList(domain, true);
} else {
console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
}
continue;
}
const lineStartsWithSingleDot = line.startsWith('.');
if (
lineStartsWithSingleDot
&& (
lineEndsWithCaret
|| lineEndsWithCaretVerticalBar
)
) {
const _domain = line
.replace('^|', '')
.replaceAll('^', '')
.slice(1)
.trim();
const domain = normalizeDomain(_domain);
if (domain) {
addToBlackList(domain, true);
} else {
console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
}
continue;
}
if (
(
line.startsWith('://')
|| line.startsWith('http://')
|| line.startsWith('https://')
|| line.startsWith('|http://')
|| line.startsWith('|https://')
)
&& (
lineEndsWithCaret
|| lineEndsWithCaretVerticalBar
)
) {
const _domain = line
.replace('|https://', '')
.replace('https://', '')
.replace('|http://', '')
.replace('http://', '')
.replace('://', '')
.replace('^|', '')
.replaceAll('^', '')
.trim();
const domain = normalizeDomain(_domain);
if (domain) {
addToBlackList(domain, false);
} else {
console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
}
continue;
}
if (line[0] !== '|' && lineEndsWithCaret) {
const _domain = line.slice(0, -1);
const domain = normalizeDomain(_domain);
if (domain) {
addToBlackList(domain, false);
} else {
console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
}
continue;
}
const tryNormalizeDomain = normalizeDomain(lineStartsWithSingleDot ? line.slice(1) : line);
if (
tryNormalizeDomain
&& (
lineStartsWithSingleDot
? tryNormalizeDomain.length === line.length - 1
: tryNormalizeDomain === line
)
) {
addToBlackList(line, true);
continue;
}
if (
!line.endsWith('.js')
) {
hasParseFailed = true;
console.warn(' * [parse-filter E0010] can not parse:', line);
for (let i = 0, len = filterRules.length; i < len; i++) {
const line = filterRules[i].trim();
lineCb(line);
}
}
console.timeEnd(` - processFilterRules: ${filterRulesUrl}`);
console.log(` processFilterRules (${filterRulesUrl}): ${(performance.now() - runStart).toFixed(3)}ms`);
console.log(` └── download time: ${downloadTime.toFixed(3)}ms`);
return {
white: whitelistDomainSets,
black: blacklistDomainSets,
foundDebugDomain,
parseFailed: hasParseFailed
foundDebugDomain
};
}
/**
* Parse one filter-list line into a `[hostname, flag]` pair.
*
* The line is trimmed, run through a set of cheap string rejections, then
* handed to `NetworkFilter.parse`. When that does not produce a result, a
* sequence of prefix/suffix pattern checks is tried in order; the first one
* that matches decides the outcome.
*
* @param {string} $line - raw line; it is trimmed before any check
* @param {boolean} includeThirdParties - when true, a filter for which `thirdParty()` is true is returned with flag 2 instead of being dropped
* @returns {null | [string, 0 | 1 | 2]} - `[hostname, flag]` with flag 0 white, 1 black absolute, 2 black include subdomain; `null` when the line is skipped or can not be parsed
*/
function parse($line, includeThirdParties) {
const line = $line.trim();
// Cheap string-level rejections before any heavier parsing is attempted.
if (
line === ''
|| line[0] === '/'
|| R_KNOWN_NOT_NETWORK_FILTER_PATTERN.test(line)
// doesn't include
|| !line.includes('.') // a rule without a dot cannot be a domain
// includes
// || line.includes('#')
|| line.includes('!')
|| line.includes('?')
|| line.includes('*')
// || line.includes('=')
|| line.includes('[')
|| line.includes('(')
|| line.includes(']')
|| line.includes(')')
|| line.includes(',')
// || line.includes('~')
// || line.includes('&')
// || line.includes('%')
// ends with
|| line.endsWith('.')
|| line.endsWith('-')
|| line.endsWith('_')
// special modifier
|| R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
// a '/' or ':' is only tolerated when the line also contains '://'
|| ((line.includes('/') || line.includes(':')) && !line.includes('://'))
// || line.includes('$popup')
// || line.includes('$removeparam')
// || line.includes('$popunder')
) {
return null;
}
// First attempt: let NetworkFilter parse the line. If it returns nothing, or
// the filter is not a plain hostname filter, control falls through to the
// manual pattern checks further down.
const filter = NetworkFilter.parse(line);
if (filter) {
if (
filter.isElemHide()
|| filter.isGenericHide()
|| filter.isSpecificHide()
|| filter.isRedirect()
|| filter.isRedirectRule()
|| filter.hasDomains()
|| filter.isCSP() // must not be csp rule
|| (!filter.fromAny() && !filter.fromDocument())
) {
// not supported type
return null;
}
if (
filter.hasHostname() // must have
&& filter.isPlain()
&& (!filter.isRegex())
&& (!filter.isFullRegex())
) {
const hostname = normalizeDomain(filter.getHostname());
if (hostname) {
// exception / badfilter rules go to the whitelist
if (filter.isException() || filter.isBadFilter()) {
return [hostname, 0];
}
// firstParty() and thirdParty() agree: blacklist including subdomains
if (filter.firstParty() === filter.thirdParty()) {
return [hostname, 2];
}
// From here exactly one of firstParty() / thirdParty() is true.
if (filter.thirdParty()) {
if (includeThirdParties) {
return [hostname, 2];
}
return null;
}
if (filter.firstParty()) {
return null;
}
} else {
// normalizeDomain rejected the hostname reported by the filter
return null;
}
}
}
// Manual fallback. Lines carrying these modifiers are dropped outright.
if (line.includes('$third-party') || line.includes('$frame')) {
return null;
}
const lineEndsWithCaret = line.endsWith('^');
const lineEndsWithCaretVerticalBar = line.endsWith('^|');
// Lines starting with "@@" are whitelist candidates.
if (line[0] === '@' && line[1] === '@') {
if (line.endsWith('$cname')) {
return null;
}
if (
// (line.startsWith('@@|') || line.startsWith('@@.'))
(
line[2] === '|'
|| line[2] === '.'
)
&& (
lineEndsWithCaret
|| lineEndsWithCaretVerticalBar
|| line.endsWith('$genericblock')
|| line.endsWith('$document')
)
) {
// Strip the known prefix/suffix markers. replace() with a string pattern
// removes only the first occurrence; replaceAll('^', '') removes every caret.
const _domain = line
.replace('@@||', '')
.replace('@@|', '')
.replace('@@.', '')
.replace('^|', '')
.replace('^$genericblock', '')
.replace('$genericblock', '')
.replace('^$document', '')
.replace('$document', '')
.replaceAll('^', '')
.trim();
const domain = normalizeDomain(_domain);
if (domain) {
return [domain, 0];
}
// NOTE(review): this is the whitelist branch (flag 0) yet the message says
// "(black)" — confirm whether that label is intended.
console.warn(' * [parse-filter E0001] (black) invalid domain:', _domain);
return null;
}
}
// "||domain^", "||domain^|" or "||domain$cname": blacklist including subdomains.
if (
line.startsWith('||')
&& (
lineEndsWithCaret
|| lineEndsWithCaretVerticalBar
|| line.endsWith('$cname')
)
) {
const _domain = line
.replace('||', '')
.replace('^|', '')
.replace('$cname', '')
.replaceAll('^', '')
.trim();
const domain = normalizeDomain(_domain);
if (domain) {
return [domain, 2];
}
console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
return null;
}
const lineStartsWithSingleDot = line.startsWith('.');
// ".domain^" or ".domain^|": blacklist including subdomains.
if (
lineStartsWithSingleDot
&& (
lineEndsWithCaret
|| lineEndsWithCaretVerticalBar
)
) {
const _domain = line
.replace('^|', '')
.replaceAll('^', '')
.slice(1)
.trim();
const domain = normalizeDomain(_domain);
if (domain) {
return [domain, 2];
}
console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
return null;
}
// Scheme-prefixed rule ending with "^" or "^|": blacklist with flag 1.
if (
(
line.startsWith('://')
|| line.startsWith('http://')
|| line.startsWith('https://')
|| line.startsWith('|http://')
|| line.startsWith('|https://')
)
&& (
lineEndsWithCaret
|| lineEndsWithCaretVerticalBar
)
) {
const _domain = line
.replace('|https://', '')
.replace('https://', '')
.replace('|http://', '')
.replace('http://', '')
.replace('://', '')
.replace('^|', '')
.replaceAll('^', '')
.trim();
const domain = normalizeDomain(_domain);
if (domain) {
return [domain, 1];
}
console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
return null;
}
// "domain^" not starting with '|': drop the trailing caret, blacklist with flag 1.
if (line[0] !== '|' && lineEndsWithCaret) {
const _domain = line.slice(0, -1);
const domain = normalizeDomain(_domain);
if (domain) {
return [domain, 1];
}
console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
return null;
}
// Last resort: accept the line only if normalizeDomain leaves it unchanged
// (compared by length when a leading dot was sliced off). The returned
// hostname is `line` itself, so a leading dot, if present, is kept.
const tryNormalizeDomain = normalizeDomain(lineStartsWithSingleDot ? line.slice(1) : line);
if (
tryNormalizeDomain
&& (
lineStartsWithSingleDot
? tryNormalizeDomain.length === line.length - 1
: tryNormalizeDomain === line
)
) {
return [line, 2];
}
// Nothing matched; lines ending in ".js" are dropped without a warning.
if (!line.endsWith('.js')) {
console.warn(' * [parse-filter E0010] can not parse:', line);
}
return null;
}
module.exports.processDomainLists = processDomainLists;
module.exports.processHosts = processHosts;
module.exports.processFilterRules = processFilterRules;

View File

@@ -0,0 +1,11 @@
const domainSorter = require('./stable-sort-domain');
const chai = require('chai');
const { describe, it } = require('mocha');
chai.should();
// Ordering regression check for the domain comparator.
describe('stable-sort-domain', () => {
  it('.ks.cn, .tag.unclaimedproperty.ks.gov', () => {
    // The comparator is expected to place '.ks.cn' first (result -1).
    const comparison = domainSorter('.ks.cn', '.tag.unclaimedproperty.ks.gov');
    comparison.should.eql(-1);
  });
});

View File

@@ -278,7 +278,7 @@ class Trie {
* Static .from function taking an arbitrary iterable & converting it into
* a trie.
*
* @param {string[]} iterable - Target iterable.
* @param {string[] | Set<string>} iterable - Target iterable.
* @return {Trie}
*/
static from = iterable => {

View File

@@ -2,6 +2,7 @@ require('chai').should();
const Trie = require('./trie');
const assert = require('assert');
const { describe, it } = require('mocha');
describe('Trie', () => {
it('should be possible to add items to a Trie.', () => {