Perf: speed up infra

SukkaW 2023-09-13 17:28:34 +08:00
parent c2022ce61d
commit 23c9a963aa
14 changed files with 390 additions and 321 deletions

View File

@ -15,17 +15,17 @@ const EXCLUDE_CIDRS = [
runner(__filename, async () => {
const { exclude: excludeCidrs } = await import('cidr-tools-wasm');
/** @type {Set<string>} */
const cidr = new Set();
/** @type {string[]} */
const cidr = [];
for await (const line of await fetchRemoteTextAndCreateReadlineInterface('https://raw.githubusercontent.com/misakaio/chnroutes2/master/chnroutes.txt')) {
const l = processLine(line);
if (l) {
cidr.add(l);
cidr.push(l);
}
}
console.log('Before Merge:', cidr.size);
const filteredCidr = excludeCidrs(Array.from(cidr), EXCLUDE_CIDRS, true);
console.log('Before Merge:', cidr.length);
const filteredCidr = excludeCidrs(cidr, EXCLUDE_CIDRS, true);
console.log('After Merge:', filteredCidr.length);
const description = [
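
The commit swaps the collecting Set for a plain array here: the values were only pushed and then copied out with Array.from before being handed to excludeCidrs. A minimal sketch of the resulting collect-then-exclude pattern, assuming the upstream chnroutes.txt carries no duplicate entries (otherwise the Set's dedup would still matter); the helper paths follow the repo layout used by the surrounding scripts:

const { fetchRemoteTextAndCreateReadlineInterface } = require('./lib/fetch-remote-text-by-line');
const { processLine } = require('./lib/process-line');

const collectCidrs = async (url) => {
  /** @type {string[]} */
  const cidr = [];
  for await (const line of await fetchRemoteTextAndCreateReadlineInterface(url)) {
    const l = processLine(line); // drops comments and blank lines
    if (l) cidr.push(l);         // push skips Set hashing and the later Array.from copy
  }
  return cidr;                   // handed straight to excludeCidrs(cidr, EXCLUDE_CIDRS, true)
};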

View File

@ -55,8 +55,7 @@ runner(__filename, async () => {
`${domain} = server:${dns}`,
`*.${domain} = server:${dns}`
])
),
''
)
],
path.resolve(__dirname, '../Modules/sukka_local_dns_mapping.sgmodule')
)
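
The trailing '' element dropped here (and from the cdn.txt array below) only existed to make join('\n') end the file with a newline; the stream-based writer introduced in lib/create-file now appends '\n' after every line, so the sentinel is redundant. A tiny self-contained check of that equivalence, with made-up content:

// Old: a final '' element made join('\n') end with a newline.
// New: the writer emits '\n' per line, producing the same bytes.
const lines = ['[Host]', 'example.dev = server:10.0.0.1'];   // hypothetical lines
const oldOutput = [...lines, ''].join('\n');                  // ends with '\n' thanks to the '' sentinel
const newOutput = lines.map((l) => `${l}\n`).join('');        // what the stream writer produces
console.log(oldOutput === newOutput);                          // true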

View File

@ -1,5 +1,4 @@
// @ts-check
const fs = require('fs');
const fse = require('fs-extra');
const path = require('path');
const { isDomainLoose } = require('./lib/is-domain-loose');
@ -8,6 +7,7 @@ const { processLine } = require('./lib/process-line');
const { readFileByLine } = require('./lib/fetch-remote-text-by-line');
const domainSorter = require('./lib/stable-sort-domain');
const { runner } = require('./lib/trace-runner');
const { compareAndWriteFile } = require('./lib/create-file');
/**
* @param {string} string
@ -77,12 +77,11 @@ runner(__filename, async () => {
fse.ensureDir(path.resolve(__dirname, '../List/internal'))
]);
await fs.promises.writeFile(
path.resolve(__dirname, '../List/internal/cdn.txt'),
await compareAndWriteFile(
[
...Array.from(set).sort(domainSorter).map(i => `SUFFIX,${i}`),
...Array.from(keywords).sort().map(i => `REGEX,${i}`),
''
].join('\n')
...Array.from(keywords).sort().map(i => `REGEX,${i}`)
],
path.resolve(__dirname, '../List/internal/cdn.txt')
);
});
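
The unconditional fs.promises.writeFile plus manual join('\n') gives way to the shared compareAndWriteFile helper (implemented further down in this commit): call sites pass a string[] and the helper skips the write entirely when the on-disk content already matches. A minimal usage sketch with hypothetical rule entries:

const path = require('path');
const { compareAndWriteFile } = require('./lib/create-file');

(async () => {
  await compareAndWriteFile(
    ['SUFFIX,cdn.example.com', 'REGEX,^cdn\\d+\\.'],    // made-up entries
    path.resolve(__dirname, '../List/internal/cdn.txt') // same target as before
  );
  // calling again with identical lines logs "Same Content, bail out writing: ..." and returns
})();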

View File

@ -27,18 +27,18 @@ const RESERVED_IPV4_CIDR = [
runner(__filename, async () => {
const { exclude } = await import('cidr-tools-wasm');
/** @type {Set<string>} */
const cidr = new Set();
/** @type {string[]} */
const cidr = [];
for await (const line of await fetchRemoteTextAndCreateReadlineInterface('https://raw.githubusercontent.com/misakaio/chnroutes2/master/chnroutes.txt')) {
const l = processLine(line);
if (l) {
cidr.add(l);
cidr.push(l);
}
}
const reversedCidr = exclude(
['0.0.0.0/0'],
RESERVED_IPV4_CIDR.concat(Array.from(cidr)),
RESERVED_IPV4_CIDR.concat(cidr),
true
);

View File

@ -95,7 +95,7 @@ const PRESET_MITM_HOSTNAMES = [
}));
let mitmDomains = new Set(PRESET_MITM_HOSTNAMES); // Special case for parse failures
const parsedFailures = new Set();
const parsedFailures = [];
const dedupedUrlRegexPaths = [...new Set(urlRegexPaths)];

View File

@ -62,9 +62,7 @@ const BLACK_TLD = new Set([
runner(__filename, async () => {
const domainSet = Array.from(
(
await processFilterRules('https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt')
).black
(await processFilterRules('https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt')).black
);
const domainCountMap = {};

View File

@ -50,13 +50,9 @@ const domainSuffixSet = new Set();
const { white, black, foundDebugDomain } = i;
if (foundDebugDomain) {
shouldStop = true;
// we should not break here, as we want to see full matches from all data source
}
white.forEach(i => {
// if (PREDEFINED_ENFORCED_BACKLIST.some(j => i.endsWith(j))) {
// return;
// }
filterRuleWhitelistDomainSets.add(i);
});
white.forEach(i => filterRuleWhitelistDomainSets.add(i));
black.forEach(i => domainSets.add(i));
} else {
process.exitCode = 1;
@ -71,15 +67,9 @@ const domainSuffixSet = new Set();
if (i) {
const { white, black } = i;
white.forEach(i => {
// if (PREDEFINED_ENFORCED_BACKLIST.some(j => i.endsWith(j))) {
// return;
// }
filterRuleWhitelistDomainSets.add(i);
});
black.forEach(i => {
// if (PREDEFINED_ENFORCED_BACKLIST.some(j => i.endsWith(j))) {
// return;
// }
filterRuleWhitelistDomainSets.add(i);
});
} else {
@ -89,7 +79,8 @@ const domainSuffixSet = new Set();
})))
]);
const trie0 = Trie.from(Array.from(filterRuleWhitelistDomainSets));
// remove pre-defined enforced blacklist from whitelist
const trie0 = Trie.from(filterRuleWhitelistDomainSets);
PREDEFINED_ENFORCED_BACKLIST.forEach(enforcedBlack => {
trie0.find(enforcedBlack).forEach(found => filterRuleWhitelistDomainSets.delete(found));
});
@ -140,7 +131,7 @@ const domainSuffixSet = new Set();
const kwfilter = createKeywordFilter(Array.from(domainKeywordsSet));
const trie1 = Trie.from(Array.from(domainSets));
const trie1 = Trie.from(domainSets);
domainSuffixSet.forEach(suffix => {
trie1.find(suffix, true).forEach(f => domainSets.delete(f));
});
@ -149,7 +140,7 @@ const domainSuffixSet = new Set();
});
// Build whitelist trie, to handle case like removing `g.msn.com` due to white `.g.msn.com` (`@@||g.msn.com`)
const trieWhite = Trie.from(Array.from(filterRuleWhitelistDomainSets));
const trieWhite = Trie.from(filterRuleWhitelistDomainSets);
for (const domain of domainSets) {
if (domain[0] === '.') {
if (trieWhite.contains(domain)) {
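
Trie.from (see the lib/trie change near the end of this diff) now consumes the Set directly, removing the Array.from copies, and the pre-defined enforced blacklist is pruned out of the whitelist through trie lookups. A sketch of that call shape, assuming, as the surrounding code implies, that find(suffix) returns every stored domain ending with the given suffix; the data is made up:

const Trie = require('./lib/trie');

const filterRuleWhitelistDomainSets = new Set(['good.example.com', 'cdn.tracker.example']);
const PREDEFINED_ENFORCED_BACKLIST = ['tracker.example'];        // hypothetical suffix

const trie0 = Trie.from(filterRuleWhitelistDomainSets);          // Set handed over as-is
PREDEFINED_ENFORCED_BACKLIST.forEach((enforcedBlack) => {
  // every whitelisted domain matching an enforced-black suffix is removed again
  trie0.find(enforcedBlack).forEach((found) => filterRuleWhitelistDomainSets.delete(found));
});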

View File

@ -49,8 +49,8 @@ runner(__filename, async () => {
* @param {string} sourcePath
*/
const processFile = async (sourcePath) => {
/** @type {Set<string>} */
const lines = new Set();
/** @type {string[]} */
const lines = [];
let title = '';
/** @type {string[]} */
@ -73,7 +73,7 @@ const processFile = async (sourcePath) => {
const l = processLine(line);
if (l) {
lines.add(l);
lines.push(l);
}
}
@ -89,7 +89,7 @@ async function transformDomainset(sourcePath, relativePath) {
if (!res) return;
const [title, descriptions, lines] = res;
const deduped = domainDeduper(Array.from(lines));
const deduped = domainDeduper(lines);
const description = [
'License: AGPL 3.0',
'Homepage: https://ruleset.skk.moe',
@ -121,7 +121,7 @@ async function transformDomainset(sourcePath, relativePath) {
async function transformRuleset(sourcePath, relativePath) {
const res = await processFile(sourcePath);
if (!res) return;
const [title, descriptions, set] = res;
const [title, descriptions, lines] = res;
const description = [
'License: AGPL 3.0',
@ -138,7 +138,7 @@ async function transformRuleset(sourcePath, relativePath) {
title,
description,
new Date(),
Array.from(set),
lines,
'ruleset',
path.resolve(outputSurgeDir, relativePath),
path.resolve(outputClashDir, `${relativePath.slice(0, -path.extname(relativePath).length)}.txt`)
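
For reference, the Clash output path in the last argument swaps the source extension for .txt; a quick standalone check of that expression, with a hypothetical ruleset path:

const path = require('path');
const relativePath = 'non_ip/reject.conf';                          // made-up example
const clashName = `${relativePath.slice(0, -path.extname(relativePath).length)}.txt`;
console.log(clashName);                                              // 'non_ip/reject.txt'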

View File

@ -36,6 +36,7 @@ runner(__filename, async () => {
if (!allFileExists) {
console.log(`File not exists: ${line}`);
break;
}
}
}

View File

@ -1,5 +1,5 @@
// @ts-check
const { promises: fsPromises } = require('fs');
const fs = require('fs');
const fse = require('fs-extra');
const { readFileByLine } = require('./fetch-remote-text-by-line');
const { surgeDomainsetToClashDomainset, surgeRulesetToClashClassicalTextRuleset } = require('./clash');
@ -28,18 +28,35 @@ async function compareAndWriteFile(linesA, filePath) {
}
}
if (!isEqual || index !== linesA.length - 1) {
await fsPromises.writeFile(
filePath,
linesA.join('\n'),
{ encoding: 'utf-8' }
);
if (!isEqual || index !== linesA.length) {
const stream = fs.createWriteStream(filePath, { encoding: 'utf-8' });
for (let i = 0, len = linesA.length; i < len; i++) {
// eslint-disable-next-line no-await-in-loop -- backpressure
await writeToStream(stream, linesA[i]);
// eslint-disable-next-line no-await-in-loop -- backpressure
await writeToStream(stream, '\n');
}
stream.end();
} else {
console.log(`Same Content, bail out writing: ${filePath}`);
}
}
module.exports.compareAndWriteFile = compareAndWriteFile;
/**
* @param {import('fs').WriteStream} stream
* @param {string} data
*/
async function writeToStream(stream, data) {
if (!stream.write(data)) {
return /** @type {Promise<void>} */(new Promise((resolve) => {
stream.once('drain', () => { resolve(); });
}));
}
return Promise.resolve();
}
/**
* @param {string} title
* @param {string[]} description
@ -56,8 +73,7 @@ const withBannerArray = (title, description, date, content) => {
...description.map(line => (line ? `# ${line}` : '#')),
'########################################',
...content,
'################# END ###################',
''
'################# END ###################'
];
};
module.exports.withBannerArray = withBannerArray;
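
compareAndWriteFile now streams each line instead of materializing the whole file as one joined string, and it awaits 'drain' whenever write() reports a full buffer; the equal-length check moves from linesA.length - 1 to linesA.length, consistent with the arrays no longer carrying a trailing '' element. A self-contained sketch of the same drain-aware loop (the scratch path and payload are made up):

const fs = require('fs');

const writeToStream = (stream, data) => {
  if (!stream.write(data)) {
    // write() returned false: the internal buffer is over the highWaterMark, wait for 'drain'
    return new Promise((resolve) => stream.once('drain', resolve));
  }
  return Promise.resolve();
};

(async () => {
  const stream = fs.createWriteStream('/tmp/lines.txt', { encoding: 'utf-8' });
  const lines = Array.from({ length: 100000 }, (_, i) => `line ${i}`);
  for (const line of lines) {
    await writeToStream(stream, line);  // honour backpressure on every chunk
    await writeToStream(stream, '\n');
  }
  stream.end();
  await new Promise((resolve) => stream.once('finish', resolve)); // flushed before moving on
})();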

View File

@ -4,6 +4,7 @@ const { fetchRemoteTextAndCreateReadlineInterface } = require('./fetch-remote-te
const { NetworkFilter } = require('@cliqz/adblocker');
const { normalizeDomain } = require('./is-domain-loose');
const { processLine } = require('./process-line');
const { performance } = require('perf_hooks');
const DEBUG_DOMAIN_TO_FIND = null; // example.com | null
let foundDebugDomain = false;
@ -98,17 +99,17 @@ const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)
/**
* @param {string | URL} filterRulesUrl
* @param {readonly (string | URL)[] | undefined} [fallbackUrls]
* @returns {Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean, parseFailed: boolean }>}
* @returns {Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }>}
*/
async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdParties = false) {
console.time(` - processFilterRules: ${filterRulesUrl}`);
const runStart = performance.now();
/** @type Set<string> */
const whitelistDomainSets = new Set();
/** @type Set<string> */
const blacklistDomainSets = new Set();
const addToBlackList = (domainToBeAddedToBlack, isSubDomain) => {
const __addToBlackList = (domainToBeAddedToBlack, isSubDomain) => {
if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) {
warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
foundDebugDomain = true;
@ -120,15 +121,62 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
blacklistDomainSets.add(domainToBeAddedToBlack);
}
};
const addToWhiteList = (domainToBeAddedToWhite) => {
const addToBlackList = DEBUG_DOMAIN_TO_FIND == null
? __addToBlackList
: (domainToBeAddedToBlack, isSubDomain) => {
if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) {
warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
foundDebugDomain = true;
}
__addToBlackList(domainToBeAddedToBlack, isSubDomain);
};
const __addToWhiteList = (domainToBeAddedToWhite) => {
whitelistDomainSets.add(domainToBeAddedToWhite);
};
const addToWhiteList = DEBUG_DOMAIN_TO_FIND == null
? __addToWhiteList
: (domainToBeAddedToWhite) => {
if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToWhite.includes(DEBUG_DOMAIN_TO_FIND)) {
warnOnce(filterRulesUrl.toString(), true, DEBUG_DOMAIN_TO_FIND);
foundDebugDomain = true;
}
whitelistDomainSets.add(domainToBeAddedToWhite);
__addToWhiteList(domainToBeAddedToWhite);
};
let downloadTime = 0;
const lineCb = (line) => {
const result = parse(line, includeThirdParties);
if (result) {
const flag = result[1];
const hostname = result[0];
switch (flag) {
case 0:
addToWhiteList(hostname);
break;
case 1:
addToBlackList(hostname, false);
break;
case 2:
addToBlackList(hostname, true);
break;
default:
throw new Error(`Unknown flag: ${flag}`);
}
}
};
if (!fallbackUrls || fallbackUrls.length === 0) {
const downloadStart = performance.now();
for await (const line of await fetchRemoteTextAndCreateReadlineInterface(filterRulesUrl)) {
lineCb(line.trim());
}
downloadTime = performance.now() - downloadStart;
} else {
let filterRules;
const downloadStart = performance.now();
try {
const controller = new AbortController();
const signal = controller.signal;
@ -150,11 +198,31 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
console.log(`Download Rule for [${filterRulesUrl}] failed`);
throw e;
}
let hasParseFailed = false;
downloadTime = performance.now() - downloadStart;
for (let i = 0, len = filterRules.length; i < len; i++) {
const line = filterRules[i].trim();
lineCb(line);
}
}
console.log(` ┬ processFilterRules (${filterRulesUrl}): ${(performance.now() - runStart).toFixed(3)}ms`);
console.log(` └── download time: ${downloadTime.toFixed(3)}ms`);
return {
white: whitelistDomainSets,
black: blacklistDomainSets,
foundDebugDomain
};
}
/**
* @param {string} $line
* @param {boolean} includeThirdParties
* @returns {null | [string, 0 | 1 | 2]} - 0 white, 1 black absolute, 2 black including subdomains
*/
function parse($line, includeThirdParties) {
const line = $line.trim();
if (
line === ''
@ -187,7 +255,7 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
// || line.includes('$removeparam')
// || line.includes('$popunder')
) {
continue;
return null;
}
const filter = NetworkFilter.parse(line);
@ -203,7 +271,7 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
|| (!filter.fromAny() && !filter.fromDocument())
) {
// not supported type
continue;
return null;
}
if (
@ -215,30 +283,28 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
const hostname = normalizeDomain(filter.getHostname());
if (hostname) {
if (filter.isException() || filter.isBadFilter()) {
addToWhiteList(hostname);
continue;
return [hostname, 0];
}
if (filter.firstParty() === filter.thirdParty()) {
addToBlackList(hostname, true);
continue;
return [hostname, 2];
}
if (filter.thirdParty()) {
if (includeThirdParties) {
addToBlackList(hostname, true);
return [hostname, 2];
}
continue;
return null;
}
if (filter.firstParty()) {
continue;
return null;
}
} else {
continue;
return null;
}
}
}
if (line.includes('$third-party') || line.includes('$frame')) {
continue;
return null;
}
const lineEndsWithCaret = line.endsWith('^');
@ -246,7 +312,7 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
if (line[0] === '@' && line[1] === '@') {
if (line.endsWith('$cname')) {
continue;
return null;
}
if (
@ -276,12 +342,11 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
const domain = normalizeDomain(_domain);
if (domain) {
addToWhiteList(domain);
} else {
console.warn(' * [parse-filter E0001] (black) invalid domain:', _domain);
return [domain, 0];
}
console.warn(' * [parse-filter E0001] (black) invalid domain:', _domain);
continue;
return null;
}
}
@ -302,11 +367,11 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
const domain = normalizeDomain(_domain);
if (domain) {
addToBlackList(domain, true);
} else {
console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
return [domain, 2];
}
continue;
console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
return null;
}
const lineStartsWithSingleDot = line.startsWith('.');
@ -325,11 +390,11 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
const domain = normalizeDomain(_domain);
if (domain) {
addToBlackList(domain, true);
} else {
console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
return [domain, 2];
}
continue;
console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
return null;
}
if (
(
@ -356,21 +421,21 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
const domain = normalizeDomain(_domain);
if (domain) {
addToBlackList(domain, false);
} else {
console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
return [domain, 1];
}
continue;
console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
return null;
}
if (line[0] !== '|' && lineEndsWithCaret) {
const _domain = line.slice(0, -1);
const domain = normalizeDomain(_domain);
if (domain) {
addToBlackList(domain, false);
} else {
console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
return [domain, 1];
}
continue;
console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
return null;
}
const tryNormalizeDomain = normalizeDomain(lineStartsWithSingleDot ? line.slice(1) : line);
if (
@ -381,26 +446,14 @@ async function processFilterRules(filterRulesUrl, fallbackUrls, includeThirdPart
: tryNormalizeDomain === line
)
) {
addToBlackList(line, true);
continue;
return [line, 2];
}
if (
!line.endsWith('.js')
) {
hasParseFailed = true;
if (!line.endsWith('.js')) {
console.warn(' * [parse-filter E0010] can not parse:', line);
}
}
console.timeEnd(` - processFilterRules: ${filterRulesUrl}`);
return {
white: whitelistDomainSets,
black: blacklistDomainSets,
foundDebugDomain,
parseFailed: hasParseFailed
};
return null;
}
module.exports.processDomainLists = processDomainLists;
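
Besides swapping console.time for explicit performance.now timings, the per-line work is split into a pure parse() returning [hostname, flag] (0 whitelist, 1 exact blacklist, 2 blacklist including subdomains) that lineCb dispatches on, and the add callbacks are chosen once up front so the normal run (DEBUG_DOMAIN_TO_FIND === null) does not re-evaluate the debug check per call. A stripped-down sketch of that specialisation trick, with made-up data and the subdomain/whitelist handling omitted:

const DEBUG_DOMAIN_TO_FIND = null; // same toggle as at the top of this file; normally null

const blacklistDomainSets = new Set();
const __addToBlackList = (domainToBeAddedToBlack) => {
  blacklistDomainSets.add(domainToBeAddedToBlack);
};

const addToBlackList = DEBUG_DOMAIN_TO_FIND == null
  ? __addToBlackList // hot path: no per-call includes() check
  : (domainToBeAddedToBlack) => {
    if (domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) {
      console.warn('debug domain hit:', domainToBeAddedToBlack);
    }
    __addToBlackList(domainToBeAddedToBlack);
  };

addToBlackList('tracker.example.com'); // hypothetical input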

View File

@ -0,0 +1,11 @@
const domainSorter = require('./stable-sort-domain');
const chai = require('chai');
const { describe, it } = require('mocha');
chai.should();
describe('stable-sort-domain', () => {
it('.ks.cn, .tag.unclaimedproperty.ks.gov', () => {
domainSorter('.ks.cn', '.tag.unclaimedproperty.ks.gov').should.eql(-1);
});
});

View File

@ -278,7 +278,7 @@ class Trie {
* Static .from function taking an arbitrary iterable & converting it into
* a trie.
*
* @param {string[]} iterable - Target iterable.
* @param {string[] | Set<string>} iterable - Target iterable.
* @return {Trie}
*/
static from = iterable => {
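
The widened @param type documents what the call sites above already rely on: Trie.from iterates its argument, so a Set can be handed over without an intermediate array. For example, with hypothetical domains:

const Trie = require('./trie');                                       // repo's lib/trie
const trie = Trie.from(new Set(['a.example.com', 'b.example.com']));  // no Array.from needed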

View File

@ -2,6 +2,7 @@ require('chai').should();
const Trie = require('./trie');
const assert = require('assert');
const { describe, it } = require('mocha');
describe('Trie', () => {
it('should be possible to add items to a Trie.', () => {