Refactor: speed up reject parsing

This commit is contained in:
SukkaW 2023-12-17 23:37:35 +08:00
parent 91ed783d73
commit 16a08bd07d
13 changed files with 175 additions and 159 deletions

View File

@ -58,7 +58,7 @@ export const buildInternalCDNDomains = task(import.meta.path, async () => {
}
};
const [gorhill] = await Promise.all([
const gorhill = (await Promise.all([
getGorhillPublicSuffixPromise(),
processLocalRuleSet(path.resolve(import.meta.dir, '../List/non_ip/cdn.conf')),
processLocalRuleSet(path.resolve(import.meta.dir, '../List/non_ip/global.conf')),
@ -70,7 +70,7 @@ export const buildInternalCDNDomains = task(import.meta.path, async () => {
processLocalDomainSet(path.resolve(import.meta.dir, '../List/domainset/download.conf')),
fsp.mkdir(path.resolve(import.meta.dir, '../List/internal'), { recursive: true })
]);
]))[0];
return compareAndWriteFile(
[

View File

@ -5,10 +5,10 @@ import { task } from './lib/trace-runner';
import { compareAndWriteFile } from './lib/create-file';
export const buildInternalChnDomains = task(import.meta.path, async () => {
const [result] = await Promise.all([
const result = (await Promise.all([
parseFelixDnsmasq('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf'),
fsp.mkdir(path.resolve(import.meta.dir, '../List/internal'), { recursive: true })
]);
]))[0];
return compareAndWriteFile(
result.map(line => `SUFFIX,${line}`),

View File

@ -25,10 +25,10 @@ const RESERVED_IPV4_CIDR = [
];
export const buildInternalReverseChnCIDR = task(import.meta.path, async () => {
const [cidr] = await Promise.all([
const cidr = (await Promise.all([
processLineFromReadline(await fetchRemoteTextAndReadByLine('https://raw.githubusercontent.com/misakaio/chnroutes2/master/chnroutes.txt')),
fsp.mkdir(path.resolve(import.meta.dir, '../List/internal'), { recursive: true })
]);
]))[0];
const reversedCidr = exclude(
[

View File

@ -16,10 +16,10 @@ const latestTopUserAgentsPromise = fetchWithRetry('https://unpkg.com/top-user-ag
.then(res => res.json() as Promise<string[]>);
const querySpeedtestApi = async (keyword: string): Promise<Array<string | null>> => {
const [topUserAgents] = await Promise.all([
const topUserAgents = (await Promise.all([
latestTopUserAgentsPromise,
s.acquire()
]);
]))[0];
const randomUserAgent = topUserAgents[Math.floor(Math.random() * topUserAgents.length)];

View File

@ -53,10 +53,10 @@ export const downloadPreviousBuild = task(import.meta.path, async () => {
await traceAsync(
'Download and extract previous build',
async () => {
const [resp] = await Promise.all([
const resp = (await Promise.all([
fetchWithRetry('https://codeload.github.com/sukkalab/ruleset.skk.moe/tar.gz/master', defaultRequestInit),
fsp.mkdir(extractedPath, { recursive: true })
]);
]))[0];
const extract = tarStream.extract();
Readable.fromWeb(resp.body!).pipe(zlib.createGunzip()).pipe(extract);
@ -88,10 +88,10 @@ export const downloadPublicSuffixList = task(import.meta.path, async () => {
const publicSuffixDir = path.resolve(import.meta.dir, '../node_modules/.cache');
const publicSuffixPath = path.join(publicSuffixDir, 'public_suffix_list_dat.txt');
const [resp] = await Promise.all([
const resp = (await Promise.all([
fetchWithRetry('https://publicsuffix.org/list/public_suffix_list.dat', defaultRequestInit),
fsp.mkdir(publicSuffixDir, { recursive: true })
]);
]))[0];
return Bun.write(publicSuffixPath, resp as Response);
}, 'download-publicsuffixlist');

View File

@ -1,6 +1,6 @@
import * as tldts from 'tldts';
import { createCache } from './cache-apply';
import type { PublicSuffixList } from 'gorhill-publicsuffixlist';
import type { PublicSuffixList } from '@gorhill/publicsuffixlist';
const cache = createCache('cached-tld-parse', true);

61
Build/lib/fetch-assets.ts Normal file
View File

@ -0,0 +1,61 @@
import picocolors from 'picocolors';
import { defaultRequestInit, fetchWithRetry } from './fetch-retry';
// Sentinel error used to cancel redundant downloads once any mirror has
// succeeded. Both `name` and `digest` are set to 'AbortError' so code that
// inspects either property treats the rejection as a deliberate
// cancellation rather than a real failure.
class CustomAbortError extends Error {
  public readonly name = 'AbortError';
  public readonly digest = 'AbortError';
}
/**
 * Sleep for `ms` milliseconds, but reject early (with `signal.reason`) if the
 * provided AbortSignal fires first.
 *
 * @param ms - duration to sleep, in milliseconds
 * @param signal - signal that can cut the sleep short
 * @returns a Promise that resolves after the delay, or rejects on abort
 *          (or if `Bun.sleep` itself rejects)
 */
const sleepWithAbort = (ms: number, signal: AbortSignal) => new Promise<void>((resolve, reject) => {
  // Fail fast if the caller handed us an already-aborted signal.
  signal.throwIfAborted();

  // `once: true` detaches the handler automatically after it fires, so an
  // aborted signal does not keep this closure (and the promise) reachable.
  signal.addEventListener('abort', stop, { once: true });

  Bun.sleep(ms).then(done).catch(doReject);

  function done() {
    // Normal completion: drop the abort handler before resolving.
    signal.removeEventListener('abort', stop);
    resolve();
  }
  function stop(this: AbortSignal) {
    // Aborted: surface the abort reason to the awaiting caller.
    reject(this.reason);
  }
  function doReject(reason: unknown) {
    // Bun.sleep failed for some other reason; clean up and propagate.
    signal.removeEventListener('abort', stop);
    reject(reason);
  }
});
/**
 * Download a text asset from `url`, racing it against staggered fallback
 * mirrors. Fallbacks are delayed (500ms plus a small per-index stagger) so
 * the primary URL usually wins; whichever request finishes first aborts all
 * the others via a shared AbortController. Rejects only if every source
 * fails.
 *
 * @param url - primary asset URL
 * @param fallbackUrls - mirror URLs tried after a grace period
 * @returns the asset body as text
 */
export async function fetchAssets(url: string, fallbackUrls: string[] | readonly string[]) {
  const abortController = new AbortController();
  const { signal } = abortController;

  const fetchMain = async (): Promise<string> => {
    const res = await fetchWithRetry(url, { signal, ...defaultRequestInit });
    const text = await res.text();
    // First successful download cancels everything still in flight.
    abortController.abort();
    return text;
  };

  const fetchFallback = async (fallbackUrl: string, index: number): Promise<string> => {
    // Most assets can be downloaded within 250ms. To avoid wasting bandwidth, we will wait for 500ms before downloading from the fallback URL.
    try {
      await sleepWithAbort(500 + (index + 1) * 20, signal);
    } catch {
      console.log(picocolors.gray('[fetch cancelled early]'), picocolors.gray(fallbackUrl));
      throw new CustomAbortError();
    }
    if (signal.aborted) {
      console.log(picocolors.gray('[fetch cancelled]'), picocolors.gray(fallbackUrl));
      throw new CustomAbortError();
    }
    const res = await fetchWithRetry(fallbackUrl, { signal, ...defaultRequestInit });
    const text = await res.text();
    abortController.abort();
    return text;
  };

  try {
    // Promise.any: first fulfilled source wins; rejects (AggregateError)
    // only when every source rejects.
    return await Promise.any([
      fetchMain(),
      ...fallbackUrls.map(fetchFallback)
    ]);
  } catch (e) {
    console.log(`Download Rule for [${url}] failed`);
    throw e;
  }
}

View File

@ -2,7 +2,7 @@ import { toASCII } from 'punycode';
import path from 'path';
import { traceAsync } from './trace-runner';
import { defaultRequestInit, fetchWithRetry } from './fetch-retry';
import type { PublicSuffixList } from 'gorhill-publicsuffixlist';
import type { PublicSuffixList } from '@gorhill/publicsuffixlist';
const publicSuffixPath = path.resolve(import.meta.dir, '../../node_modules/.cache/public_suffix_list_dat.txt');
@ -18,7 +18,7 @@ const getGorhillPublicSuffix = () => traceAsync('create gorhill public suffix in
console.log('public_suffix_list.dat not found, fetch directly from remote.');
return r.text();
}),
import('gorhill-publicsuffixlist')
import('@gorhill/publicsuffixlist')
]);
gorhill.parse(publicSuffixListDat, toASCII);

View File

@ -1,21 +1,20 @@
// @ts-check
import { defaultRequestInit, fetchWithRetry } from './fetch-retry';
import { fetchRemoteTextAndReadByLine } from './fetch-text-by-line';
import { NetworkFilter } from '@cliqz/adblocker';
import { processLine } from './process-line';
import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
import type { PublicSuffixList } from 'gorhill-publicsuffixlist';
import type { PublicSuffixList } from '@gorhill/publicsuffixlist';
import { traceAsync } from './trace-runner';
import picocolors from 'picocolors';
import { normalizeDomain } from './normalize-domain';
import { fetchAssets } from './fetch-assets';
const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
let foundDebugDomain = false;
const warnOnceUrl = new Set<string>();
const warnOnce = (url: string, isWhite: boolean, ...message: any[]) => {
const warnOnce = (url: string, isWhite: boolean, ...message: string[]) => {
const key = `${url}${isWhite ? 'white' : 'black'}`;
if (warnOnceUrl.has(key)) {
return;
@ -54,7 +53,7 @@ export function processHosts(hostsUrl: string, includeAllSubDomain = false, skip
continue;
}
const [, domain] = line.split(/\s/);
const domain = line.split(/\s/)[1];
if (!domain) {
continue;
}
@ -185,7 +184,9 @@ export async function processFilterRules(
}
const R_KNOWN_NOT_NETWORK_FILTER_PATTERN = /[#%&=~]/;
const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder)/;
const R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2 = /(\$popup|\$removeparam|\$popunder|\$cname)/;
// cname exceptional filter can not be parsed by NetworkFilter
// Surge / Clash can't handle CNAME either, so we just ignore them
function parse($line: string, gorhill: PublicSuffixList): null | [hostname: string, flag: ParseType] {
if (
@ -213,15 +214,15 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
return null;
}
const firstChar = line[0];
const lastChar = line[len - 1];
const firstCharCode = line[0].charCodeAt(0);
const lastCharCode = line[len - 1].charCodeAt(0);
if (
firstChar === '/'
firstCharCode === 47 // 47 `/`
// ends with
|| lastChar === '.' // || line.endsWith('.')
|| lastChar === '-' // || line.endsWith('-')
|| lastChar === '_' // || line.endsWith('_')
|| lastCharCode === 46 // 46 `.`, line.endsWith('.')
|| lastCharCode === 45 // 45 `-`, line.endsWith('-')
|| lastCharCode === 95 // 95 `_`, line.endsWith('_')
// special modifier
|| R_KNOWN_NOT_NETWORK_FILTER_PATTERN_2.test(line)
// || line.includes('$popup')
@ -238,6 +239,8 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
const filter = NetworkFilter.parse(line);
if (filter) {
if (
// filter.isCosmeticFilter() // always false
// filter.isNetworkFilter() // always true
filter.isElemHide()
|| filter.isGenericHide()
|| filter.isSpecificHide()
@ -253,8 +256,7 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
if (
filter.hostname // filter.hasHostname() // must have
&& filter.isPlain()
// && (!filter.isRegex()) // isPlain() === !isRegex()
&& filter.isPlain() // isPlain() === !isRegex()
&& (!filter.isFullRegex())
) {
const hostname = normalizeDomain(filter.hostname);
@ -286,95 +288,106 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
}
}
/**
* abnormal filter that can not be parsed by NetworkFilter
*/
// After NetworkFilter.parse, it means the line can not be parsed by cliqz NetworkFilter
// We now need to "salvage" the line as much as possible
if (line.includes('$third-party') || line.includes('$frame')) {
/*
* From now on, we are mostly facing non-standard domain rules (some are regex like)
* We first skip third-party and frame rules, as Surge / Clash can't handle them
*
* `.sharecounter.$third-party`
* `.bbelements.com^$third-party`
* `://o0e.ru^$third-party`
* `.1.1.1.l80.js^$third-party`
*/
if (line.includes('$third-party') || line.includes('$frame')) {
return null;
}
/** @example line.endsWith('^') */
const linedEndsWithCaret = lastChar === '^';
const linedEndsWithCaret = lastCharCode === 94; // lastChar === '^';
/** @example line.endsWith('^|') */
const lineEndsWithCaretVerticalBar = lastChar === '|' && line[len - 2] === '^';
const lineEndsWithCaretVerticalBar = (lastCharCode === 124 /** lastChar === '|' */) && line[len - 2] === '^';
/** @example line.endsWith('^') || line.endsWith('^|') */
const lineEndsWithCaretOrCaretVerticalBar = linedEndsWithCaret || lineEndsWithCaretVerticalBar;
// whitelist (exception)
if (firstChar === '@' && line[1] === '@') {
/**
* cname exceptional filter can not be parsed by NetworkFilter
*
* `@@||m.faz.net^$cname`
*
* Surge / Clash can't handle CNAME either, so we just ignore them
*/
if (line.endsWith('$cname')) {
return null;
}
if (
firstCharCode === 64 // 64 `@`
&& line[1] === '@'
) {
/**
* Some "malformed" regex-based filters can not be parsed by NetworkFilter
* "$genericblock`" is also not supported by NetworkFilter
* "$genericblock`" is also not supported by NetworkFilter, see:
* https://github.com/ghostery/adblocker/blob/62caf7786ba10ef03beffecd8cd4eec111bcd5ec/packages/adblocker/test/parsing.test.ts#L950
*
* `@@||cmechina.net^$genericblock`
* `@@|ftp.bmp.ovh^|`
* `@@|adsterra.com^|`
* `@@.atlassian.net$document`
* `@@||ad.alimama.com^$genericblock`
*/
if (
(
// line.startsWith('@@|')
line[2] === '|'
// line.startsWith('@@.')
|| line[2] === '.'
let sliceStart = 0;
let sliceEnd: number | undefined;
// line.startsWith('@@|') || line.startsWith('@@.')
if (line[2] === '|' || line[2] === '.') {
sliceStart = 3;
// line.startsWith('@@||')
if (line[3] === '|') {
sliceStart = 4;
}
}
/**
* line.startsWith('@@://')
*
* `@@://googleadservices.com^|`
* `@@://www.googleadservices.com^|`
*/
|| (line[2] === ':' && line[3] === '/' && line[4] === '/')
)
&& (
lineEndsWithCaretOrCaretVerticalBar
|| line.endsWith('$genericblock')
|| line.endsWith('$document')
)
) {
const _domain = line
.replace('@@||', '')
.replace('@@://', '')
.replace('@@|', '')
.replace('@@.', '')
.replace('^|', '')
.replace('^$genericblock', '')
.replace('$genericblock', '')
.replace('^$document', '')
.replace('$document', '')
.replaceAll('^', '')
.trim();
if (line[2] === ':' && line[3] === '/' && line[4] === '/') {
sliceStart = 5;
}
const domain = normalizeDomain(_domain);
if (lineEndsWithCaretOrCaretVerticalBar) {
sliceEnd = -2;
} else if (line.endsWith('$genericblock')) {
sliceEnd = -13;
if (line[len - 14] === '^') { // line.endsWith('^$genericblock')
sliceEnd = -14;
}
} else if (line.endsWith('$document')) {
sliceEnd = -9;
if (line[len - 10] === '^') { // line.endsWith('^$document')
sliceEnd = -10;
}
}
if (sliceStart !== 0 || sliceEnd !== undefined) {
const sliced = line.slice(sliceStart, sliceEnd);
const domain = normalizeDomain(sliced);
if (domain) {
return [domain, ParseType.WhiteIncludeSubdomain];
}
return [
`[parse-filter E0001] (white) invalid domain: ${_domain}`,
`[parse-filter E0001] (white) invalid domain: ${JSON.stringify({
line, sliced, sliceStart, sliceEnd
})}`,
ParseType.ErrorMessage
];
}
return [
`[parse-filter E0006] (white) failed to parse: ${JSON.stringify({
line, sliceStart, sliceEnd
})}`,
ParseType.ErrorMessage
];
}
if (firstChar === '|') {
const lineEndswithCname = line.endsWith('$cname');
if (lineEndsWithCaretOrCaretVerticalBar || lineEndswithCname) {
if (firstCharCode === 124) { // 124 `|`
if (lineEndsWithCaretOrCaretVerticalBar) {
/**
* Some malformed filters can not be parsed by NetworkFilter:
*
@ -387,12 +400,11 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
const includeAllSubDomain = line[1] === '|';
const sliceStart = includeAllSubDomain ? 2 : 1;
const sliceEnd = lastChar === '^'
const sliceEnd = lastCharCode === 94 // lastChar === '^'
? -1
: lineEndsWithCaretOrCaretVerticalBar
: (lineEndsWithCaretVerticalBar
? -2
// eslint-disable-next-line sukka/unicorn/no-nested-ternary -- speed
: (lineEndswithCname ? -6 : 0);
: undefined);
const _domain = line
.slice(sliceStart, sliceEnd) // we already make sure line startsWith "|"
@ -410,7 +422,7 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
}
}
const lineStartsWithSingleDot = firstChar === '.';
const lineStartsWithSingleDot = firstCharCode === 46; // 46 `.`
if (
lineStartsWithSingleDot
&& lineEndsWithCaretOrCaretVerticalBar
@ -489,7 +501,10 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
* `-logging.nextmedia.com`
* `_social_tracking.js^`
*/
if (firstChar !== '|' && lastChar === '^') {
if (
firstCharCode !== 124 // 124 `|`
&& lastCharCode === 94 // 94 `^`
) {
const _domain = line.slice(0, -1);
const suffix = gorhill.getPublicSuffix(_domain);
@ -553,63 +568,3 @@ function parse($line: string, gorhill: PublicSuffixList): null | [hostname: stri
ParseType.ErrorMessage
];
}
class CustomAbortError extends Error {
public readonly name = 'AbortError';
public readonly digest = 'AbortError';
}
const sleepWithAbort = (ms: number, signal: AbortSignal) => new Promise<void>((resolve, reject) => {
signal.throwIfAborted();
signal.addEventListener('abort', stop);
Bun.sleep(ms).then(done).catch(doReject);
function done() {
signal.removeEventListener('abort', stop);
resolve();
}
function stop(this: AbortSignal) {
reject(this.reason);
}
function doReject(reason: unknown) {
signal.removeEventListener('abort', stop);
reject(reason);
}
});
async function fetchAssets(url: string, fallbackUrls: string[] | readonly string[]) {
const controller = new AbortController();
const fetchMainPromise = fetchWithRetry(url, { signal: controller.signal, ...defaultRequestInit })
.then(r => r.text())
.then(text => {
console.log(picocolors.gray('[fetch finish]'), picocolors.gray(url));
controller.abort();
return text;
});
const createFetchFallbackPromise = async (url: string, index: number) => {
// Most assets can be downloaded within 250ms. To avoid wasting bandwidth, we will wait for 350ms before downloading from the fallback URL.
try {
await sleepWithAbort(300 + (index + 1) * 20, controller.signal);
} catch {
console.log(picocolors.gray('[fetch cancelled early]'), picocolors.gray(url));
throw new CustomAbortError();
}
if (controller.signal.aborted) {
console.log(picocolors.gray('[fetch cancelled]'), picocolors.gray(url));
throw new CustomAbortError();
}
const res = await fetchWithRetry(url, { signal: controller.signal, ...defaultRequestInit });
const text = await res.text();
controller.abort();
return text;
};
return Promise.any([
fetchMainPromise,
...fallbackUrls.map(createFetchFallbackPromise)
]).catch(e => {
console.log(`Download Rule for [${url}] failed`);
throw e;
});
}

View File

@ -1,4 +1,4 @@
import type { PublicSuffixList } from 'gorhill-publicsuffixlist';
import type { PublicSuffixList } from '@gorhill/publicsuffixlist';
import { createCachedGorhillGetDomain } from './cached-tld-parse';
const compare = (a: string | null, b: string | null) => {

2
Build/mod.d.ts vendored
View File

@ -1,4 +1,4 @@
declare module 'gorhill-publicsuffixlist' {
declare module '@gorhill/publicsuffixlist' {
type Selfie =
| string
| {

BIN
bun.lockb

Binary file not shown.

View File

@ -15,13 +15,13 @@
"license": "ISC",
"dependencies": {
"@cliqz/adblocker": "^1.26.12",
"@gorhill/publicsuffixlist": "^3.0.1",
"@sukka/listdir": "^0.3.1",
"async-retry": "^1.3.3",
"async-sema": "^3.1.1",
"ci-info": "^4.0.0",
"csv-parse": "^5.5.3",
"fast-cidr-tools": "^0.2.2",
"gorhill-publicsuffixlist": "github:gorhill/publicsuffixlist.js",
"mnemonist": "^0.39.6",
"path-scurry": "^1.10.1",
"picocolors": "^1.0.0",