Refactor: fetch directly

This commit is contained in:
SukkaW 2025-01-11 22:46:13 +08:00
parent b426f666e2
commit 6895c7ee02
4 changed files with 115 additions and 392 deletions

View File

@ -4,11 +4,11 @@ import { createReadlineInterfaceFromResponse, readFileIntoProcessedArray } from
import { task } from './trace';
import { SHARED_DESCRIPTION } from './constants/description';
import { isProbablyIpv4, isProbablyIpv6 } from 'foxts/is-probably-ip';
import { fsFetchCache, getFileContentHash } from './lib/cache-filesystem';
import { processLine } from './lib/process-line';
import { RulesetOutput } from './lib/create-file';
import { SOURCE_DIR } from './constants/dir';
import { $$fetch } from './lib/fetch-retry';
import { fetchAssetsWithout304 } from './lib/fetch-assets';
const BOGUS_NXDOMAIN_URL = 'https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/bogus-nxdomain.china.conf';
const getBogusNxDomainIPsPromise: Promise<[ipv4: string[], ipv6: string[]]> = $$fetch(BOGUS_NXDOMAIN_URL).then(async (resp) => {
@ -37,26 +37,17 @@ const BOTNET_FILTER_MIRROR_URL = [
// https://curbengh.github.io/malware-filter/botnet-filter-dnscrypt-blocked-ips.txt
];
const getBotNetFilterIPsPromise = fsFetchCache.applyWithHttp304AndMirrors<[ipv4: string[], ipv6: string[]]>(
BOTNET_FILTER_URL,
BOTNET_FILTER_MIRROR_URL,
getFileContentHash(__filename),
(text) => text.split('\n').reduce<[ipv4: string[], ipv6: string[]]>((acc, cur) => {
const ip = processLine(cur);
if (ip) {
if (isProbablyIpv4(ip)) {
acc[0].push(ip);
} else if (isProbablyIpv6(ip)) {
acc[1].push(ip);
}
const getBotNetFilterIPsPromise: Promise<[ipv4: string[], ipv6: string[]]> = fetchAssetsWithout304(BOTNET_FILTER_URL, BOTNET_FILTER_MIRROR_URL).then(text => text.split('\n').reduce<[ipv4: string[], ipv6: string[]]>((acc, cur) => {
const ip = processLine(cur);
if (ip) {
if (isProbablyIpv4(ip)) {
acc[0].push(ip);
} else if (isProbablyIpv6(ip)) {
acc[1].push(ip);
}
return acc;
}, [[], []]),
{
serializer: JSON.stringify,
deserializer: JSON.parse
}
);
return acc;
}, [[], []]));
export const buildRejectIPList = task(require.main === module, __filename)(async (span) => {
const [bogusNxDomainIPs, botNetIPs] = await Promise.all([

View File

@ -4,20 +4,12 @@ import os from 'node:os';
import path from 'node:path';
import { mkdirSync } from 'node:fs';
import picocolors from 'picocolors';
import { mergeHeaders } from 'foxts/merge-headers';
import { headersToObject } from 'foxts/headers-to-object';
import { identity } from 'foxts/identity';
import { fastStringArrayJoin } from 'foxts/fast-string-array-join';
import { performance } from 'node:perf_hooks';
import fs from 'node:fs';
import { simpleStringHash } from 'foxts/simple-string-hash';
import { defaultRequestInit, requestWithLog, ResponseError } from './fetch-retry';
import type { UndiciResponseData } from './fetch-retry';
// import type { UndiciResponseData } from './fetch-retry';
import { Custom304NotModifiedError, CustomAbortError, CustomNoETagFallbackError, fetchAssetsWithout304, sleepWithAbort } from './fetch-assets';
import type { IncomingHttpHeaders } from 'undici/types/header';
import { Headers } from 'undici';
import { CACHE_DIR } from '../constants/dir';
export interface CacheOptions<S = string> {
@ -69,20 +61,6 @@ export const TTL = {
TWO_WEEKS: () => randomInt(10, 14) * ONE_DAY
};
function ensureETag(headers: IncomingHttpHeaders | Headers) {
if (headers instanceof Headers && headers.has('etag')) {
return headers.get('etag');
}
if ('etag' in headers && typeof headers.etag === 'string' && headers.etag.length > 0) {
return headers.etag;
}
if ('ETag' in headers && typeof headers.ETag === 'string' && headers.ETag.length > 0) {
return headers.ETag;
}
return null;
}
export class Cache<S = string> {
private db: Database;
/** Time before deletion */
@ -199,238 +177,18 @@ export class Cache<S = string> {
this.statement.del.run(key);
}
async applyWithHttp304<T>(
url: string,
extraCacheKey: string,
fn: (resp: UndiciResponseData) => Promise<T>,
opt: Omit<CacheApplyOption<T, S>, 'incrementTtlWhenHit'>
// requestInit?: RequestInit
): Promise<T> {
if (opt.temporaryBypass) {
return fn(await requestWithLog(url));
}
const baseKey = url + '$' + extraCacheKey;
const etagKey = baseKey + '$etag';
const cachedKey = baseKey + '$cached';
const etag = this.get(etagKey);
const onMiss = async (resp: UndiciResponseData) => {
const serializer = 'serializer' in opt ? opt.serializer : identity as any;
const value = await fn(resp);
let serverETag = ensureETag(resp.headers);
if (serverETag) {
// FUCK someonewhocares.org
if (url.includes('someonewhocares.org')) {
serverETag = serverETag.replace('-gzip', '');
}
console.log(picocolors.yellow('[cache] miss'), url, { status: resp.statusCode, cachedETag: etag, serverETag });
this.set(etagKey, serverETag, TTL.ONE_WEEK_STATIC);
this.set(cachedKey, serializer(value), TTL.ONE_WEEK_STATIC);
return value;
}
this.del(etagKey);
console.log(picocolors.red('[cache] no etag'), picocolors.gray(url));
if (opt.ttl) {
this.set(cachedKey, serializer(value), opt.ttl);
}
return value;
};
const cached = this.get(cachedKey);
if (cached == null) {
return onMiss(await requestWithLog(url));
}
const resp = await requestWithLog(
url,
{
...defaultRequestInit,
headers: (typeof etag === 'string' && etag.length > 0)
? headersToObject(mergeHeaders(defaultRequestInit.headers, { 'If-None-Match': etag }))
: defaultRequestInit.headers
}
);
// Only miss if previously a ETag was present and the server responded with a 304
if (!ensureETag(resp.headers) && resp.statusCode !== 304) {
return onMiss(resp);
}
console.log(picocolors.green(`[cache] ${resp.statusCode === 304 ? 'http 304' : 'cache hit'}`), picocolors.gray(url));
this.updateTtl(cachedKey, TTL.ONE_WEEK_STATIC);
const deserializer = 'deserializer' in opt ? opt.deserializer : identity as any;
return deserializer(cached);
}
async applyWithHttp304AndMirrors<T>(
primaryUrl: string,
mirrorUrls: string[],
extraCacheKey: string,
fn: (resp: string) => Promise<T> | T,
opt: Omit<CacheApplyOption<T, S>, 'incrementTtlWhenHit'>
): Promise<T> {
if (opt.temporaryBypass) {
return fn(await fetchAssetsWithout304(primaryUrl, mirrorUrls));
}
const baseKey = primaryUrl + '$' + extraCacheKey;
const getETagKey = (url: string) => baseKey + '$' + url + '$etag';
const cachedKey = baseKey + '$cached';
const controller = new AbortController();
const previouslyCached = this.get(cachedKey);
const createFetchFallbackPromise = async (url: string, index: number) => {
// Most assets can be downloaded within 250ms. To avoid wasting bandwidth, we will wait for 500ms before downloading from the fallback URL.
if (index >= 0) {
try {
await sleepWithAbort(50 + (index + 1) * 100, controller.signal);
} catch {
console.log(picocolors.gray('[fetch cancelled early]'), picocolors.gray(url));
throw new CustomAbortError();
}
if (controller.signal.aborted) {
console.log(picocolors.gray('[fetch cancelled]'), picocolors.gray(url));
throw new CustomAbortError();
}
}
const etag = this.get(getETagKey(url));
const res = await requestWithLog(
url,
{
signal: controller.signal,
...defaultRequestInit,
headers: (typeof etag === 'string' && etag.length > 0 && typeof previouslyCached === 'string' && previouslyCached.length > 1)
? headersToObject(mergeHeaders(defaultRequestInit.headers, { 'If-None-Match': etag }))
: defaultRequestInit.headers
}
);
const serverETag = ensureETag(res.headers);
if (serverETag) {
this.set(getETagKey(url), serverETag, TTL.ONE_WEEK_STATIC);
}
// If we do not have a cached value, we ignore 304
if (res.statusCode === 304 && typeof previouslyCached === 'string' && previouslyCached.length > 1) {
const err = new Custom304NotModifiedError(url, previouslyCached);
controller.abort(err);
throw err;
}
if (!serverETag && !this.get(getETagKey(primaryUrl)) && typeof previouslyCached === 'string') {
const err = new CustomNoETagFallbackError(previouslyCached);
controller.abort(err);
throw err;
}
// either no etag and not cached
// or has etag but not 304
const text = await res.body.text();
if (text.length < 2) {
throw new ResponseError(res, url, 'empty response');
}
controller.abort();
return text;
};
try {
const text = mirrorUrls.length === 0
? await createFetchFallbackPromise(primaryUrl, -1)
: await Promise.any([
createFetchFallbackPromise(primaryUrl, -1),
...mirrorUrls.map(createFetchFallbackPromise)
]);
console.log(picocolors.yellow('[cache] miss'), primaryUrl);
const serializer = 'serializer' in opt ? opt.serializer : identity as any;
const value = await fn(text);
this.set(cachedKey, serializer(value), opt.ttl ?? TTL.ONE_WEEK_STATIC);
return value;
} catch (e) {
const deserializer = 'deserializer' in opt ? opt.deserializer : identity as any;
const on304 = (error: Custom304NotModifiedError): NonNullable<T> => {
console.log(picocolors.green('[cache] http 304'), picocolors.gray(primaryUrl));
this.updateTtl(cachedKey, TTL.ONE_WEEK_STATIC);
return deserializer(error.data);
};
const onNoETagFallback = (error: CustomNoETagFallbackError): NonNullable<T> => {
console.log(picocolors.green('[cache] hit'), picocolors.gray(primaryUrl));
return deserializer(error.data);
};
const onSingleError = (error: object & {}) => {
if ('name' in error) {
if (error.name === 'Custom304NotModifiedError') {
return on304(error as Custom304NotModifiedError);
}
if (error.name === 'CustomNoETagFallbackError') {
return onNoETagFallback(error as CustomNoETagFallbackError);
}
if (error.name === 'CustomAbortError' || error.name === 'AbortError') {
// noop
}
}
if ('digest' in error) {
if (error.digest === 'Custom304NotModifiedError') {
return on304(error as Custom304NotModifiedError);
}
if (error.digest === 'CustomNoETagFallbackError') {
return onNoETagFallback(error as CustomNoETagFallbackError);
}
}
return null;
};
if (e && typeof e === 'object') {
if ('errors' in e && Array.isArray(e.errors)) {
for (let i = 0, len = e.errors.length; i < len; i++) {
const error = e.errors[i];
const result = onSingleError(error);
if (result !== null) {
return result;
}
console.log(picocolors.red('[fetch error 1]'), picocolors.gray(`[${primaryUrl}]`), error);
}
} else {
const result = onSingleError(e);
if (result !== null) {
return result;
}
console.log(picocolors.red('[fetch error 2]'), picocolors.gray(`[${primaryUrl}]`), e);
}
}
console.log(`Download Rule for [${primaryUrl}] failed`, { e, name: (e as any).name });
throw e;
}
}
destroy() {
this.db.close();
}
deleteTable(tableName: string) {
this.db.exec(`DROP TABLE IF EXISTS ${tableName};`);
}
}
export const fsFetchCache = new Cache({ cachePath: CACHE_DIR });
// drop deprecated cache
new Cache({ cachePath: CACHE_DIR }).deleteTable('cache');
// process.on('exit', () => {
// fsFetchCache.destroy();
// });

View File

@ -1,5 +1,5 @@
import picocolors from 'picocolors';
import { defaultRequestInit, requestWithLog, ResponseError } from './fetch-retry';
import { $$fetch, defaultRequestInit, ResponseError } from './fetch-retry';
import { wait } from 'foxts/wait';
// eslint-disable-next-line sukka/unicorn/custom-error-definition -- typescript is better
@ -41,7 +41,7 @@ export function sleepWithAbort(ms: number, signal: AbortSignal) {
});
}
export async function fetchAssetsWithout304(url: string, fallbackUrls: string[] | readonly string[]) {
export async function fetchAssetsWithout304(url: string, fallbackUrls: null | undefined | string[] | readonly string[]) {
const controller = new AbortController();
const createFetchFallbackPromise = async (url: string, index: number) => {
@ -58,8 +58,8 @@ export async function fetchAssetsWithout304(url: string, fallbackUrls: string[]
console.log(picocolors.gray('[fetch cancelled]'), picocolors.gray(url));
throw new CustomAbortError();
}
const res = await requestWithLog(url, { signal: controller.signal, ...defaultRequestInit });
const text = await res.body.text();
const res = await $$fetch(url, { signal: controller.signal, ...defaultRequestInit });
const text = await res.text();
if (text.length < 2) {
throw new ResponseError(res, url, 'empty response w/o 304');
@ -69,6 +69,9 @@ export async function fetchAssetsWithout304(url: string, fallbackUrls: string[]
return text;
};
if (!fallbackUrls || fallbackUrls.length === 0) {
return createFetchFallbackPromise(url, -1);
}
return Promise.any([
createFetchFallbackPromise(url, -1),
...fallbackUrls.map(createFetchFallbackPromise)

View File

@ -4,16 +4,14 @@ import tldts from 'tldts-experimental';
import picocolors from 'picocolors';
import { normalizeDomain } from './normalize-domain';
import { deserializeArray, fsFetchCache, serializeArray, getFileContentHash } from './cache-filesystem';
import type { Span } from '../trace';
import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie';
import { looseTldtsOpt } from '../constants/loose-tldts-opt';
import { identity } from 'foxts/identity';
import { DEBUG_DOMAIN_TO_FIND } from '../constants/reject-data-source';
import { noop } from 'foxts/noop';
import { fetchAssetsWithout304 } from './fetch-assets';
let foundDebugDomain = false;
const temporaryBypass = typeof DEBUG_DOMAIN_TO_FIND === 'string';
const onBlackFound = DEBUG_DOMAIN_TO_FIND
? (line: string, meta: string) => {
@ -58,32 +56,24 @@ function domainListLineCb(l: string, set: string[], includeAllSubDomain: boolean
export function processDomainLists(
span: Span,
domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false,
ttl: number | null = null, extraCacheKey: (input: string) => string = identity
domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false
) {
return span.traceChild(`process domainlist: ${domainListsUrl}`).traceAsyncFn((childSpan) => fsFetchCache.applyWithHttp304AndMirrors<string[]>(
domainListsUrl,
mirrors ?? [],
extraCacheKey(getFileContentHash(__filename)),
(text) => {
const domainSets: string[] = [];
const filterRules = text.split('\n');
return span.traceChildAsync(`process domainlist: ${domainListsUrl}`, async (span) => {
const text = await span.traceChildAsync(`process domainlist: ${domainListsUrl}`, () => fetchAssetsWithout304(
domainListsUrl,
mirrors
));
const domainSets: string[] = [];
const filterRules = text.split('\n');
childSpan.traceChild('parse domain list').traceSyncFn(() => {
for (let i = 0, len = filterRules.length; i < len; i++) {
domainListLineCb(filterRules[i], domainSets, includeAllSubDomain, domainListsUrl);
}
});
span.traceChildSync('parse domain list', () => {
for (let i = 0, len = filterRules.length; i < len; i++) {
domainListLineCb(filterRules[i], domainSets, includeAllSubDomain, domainListsUrl);
}
});
return domainSets;
},
{
ttl,
temporaryBypass,
serializer: serializeArray,
deserializer: deserializeArray
}
));
return domainSets;
});
}
function hostsLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) {
@ -108,33 +98,23 @@ function hostsLineCb(l: string, set: string[], includeAllSubDomain: boolean, met
export function processHosts(
span: Span,
hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false,
ttl: number | null = null, extraCacheKey: (input: string) => string = identity
hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false
) {
return span.traceChild(`processhosts: ${hostsUrl}`).traceAsyncFn((childSpan) => fsFetchCache.applyWithHttp304AndMirrors<string[]>(
hostsUrl,
mirrors ?? [],
extraCacheKey(getFileContentHash(__filename)),
(text) => {
const domainSets: string[] = [];
return span.traceChildAsync(`process hosts: ${hostsUrl}`, async (span) => {
const text = await span.traceChild('download').traceAsyncFn(() => fetchAssetsWithout304(hostsUrl, mirrors));
const filterRules = text.split('\n');
const domainSets: string[] = [];
childSpan.traceChild('parse hosts').traceSyncFn(() => {
for (let i = 0, len = filterRules.length; i < len; i++) {
hostsLineCb(filterRules[i], domainSets, includeAllSubDomain, hostsUrl);
}
});
const filterRules = text.split('\n');
return domainSets;
},
{
ttl,
temporaryBypass,
serializer: serializeArray,
deserializer: deserializeArray
}
));
span.traceChild('parse hosts').traceSyncFn(() => {
for (let i = 0, len = filterRules.length; i < len; i++) {
hostsLineCb(filterRules[i], domainSets, includeAllSubDomain, hostsUrl);
}
});
return domainSets;
});
}
const enum ParseType {
@ -153,92 +133,83 @@ export async function processFilterRules(
parentSpan: Span,
filterRulesUrl: string,
fallbackUrls?: string[] | null,
ttl: number | null = null,
_ttl: number | null = null,
allowThirdParty = false
): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> {
const [white, black, warningMessages] = await parentSpan.traceChild(`process filter rules: ${filterRulesUrl}`).traceAsyncFn((span) => fsFetchCache.applyWithHttp304AndMirrors<Readonly<[white: string[], black: string[], warningMessages: string[]]>>(
filterRulesUrl,
fallbackUrls ?? [],
getFileContentHash(__filename),
(text) => {
const whitelistDomainSets = new Set<string>();
const blacklistDomainSets = new Set<string>();
const [white, black, warningMessages] = await parentSpan.traceChild(`process filter rules: ${filterRulesUrl}`).traceAsyncFn(async (span) => {
const text = await fetchAssetsWithout304(filterRulesUrl, fallbackUrls);
const warningMessages: string[] = [];
const whitelistDomainSets = new Set<string>();
const blacklistDomainSets = new Set<string>();
const MUTABLE_PARSE_LINE_RESULT: [string, ParseType] = ['', ParseType.NotParsed];
/**
const warningMessages: string[] = [];
const MUTABLE_PARSE_LINE_RESULT: [string, ParseType] = ['', ParseType.NotParsed];
/**
* @param {string} line
*/
const lineCb = (line: string) => {
const result = parse(line, MUTABLE_PARSE_LINE_RESULT, allowThirdParty);
const flag = result[1];
const lineCb = (line: string) => {
const result = parse(line, MUTABLE_PARSE_LINE_RESULT, allowThirdParty);
const flag = result[1];
if (flag === ParseType.NotParsed) {
throw new Error(`Didn't parse line: ${line}`);
}
if (flag === ParseType.Null) {
return;
}
if (flag === ParseType.NotParsed) {
throw new Error(`Didn't parse line: ${line}`);
}
if (flag === ParseType.Null) {
return;
}
const hostname = result[0];
const hostname = result[0];
if (flag === ParseType.WhiteIncludeSubdomain || flag === ParseType.WhiteAbsolute) {
onWhiteFound(hostname, filterRulesUrl);
} else {
onBlackFound(hostname, filterRulesUrl);
}
if (flag === ParseType.WhiteIncludeSubdomain || flag === ParseType.WhiteAbsolute) {
onWhiteFound(hostname, filterRulesUrl);
} else {
onBlackFound(hostname, filterRulesUrl);
}
switch (flag) {
case ParseType.WhiteIncludeSubdomain:
if (hostname[0] === '.') {
whitelistDomainSets.add(hostname);
} else {
whitelistDomainSets.add(`.${hostname}`);
}
break;
case ParseType.WhiteAbsolute:
switch (flag) {
case ParseType.WhiteIncludeSubdomain:
if (hostname[0] === '.') {
whitelistDomainSets.add(hostname);
break;
case ParseType.BlackIncludeSubdomain:
if (hostname[0] === '.') {
blacklistDomainSets.add(hostname);
} else {
blacklistDomainSets.add(`.${hostname}`);
}
break;
case ParseType.BlackAbsolute:
} else {
whitelistDomainSets.add(`.${hostname}`);
}
break;
case ParseType.WhiteAbsolute:
whitelistDomainSets.add(hostname);
break;
case ParseType.BlackIncludeSubdomain:
if (hostname[0] === '.') {
blacklistDomainSets.add(hostname);
break;
case ParseType.ErrorMessage:
warningMessages.push(hostname);
break;
default:
break;
}
};
} else {
blacklistDomainSets.add(`.${hostname}`);
}
break;
case ParseType.BlackAbsolute:
blacklistDomainSets.add(hostname);
break;
case ParseType.ErrorMessage:
warningMessages.push(hostname);
break;
default:
break;
}
};
const filterRules = text.split('\n');
const filterRules = text.split('\n');
span.traceChild('parse adguard filter').traceSyncFn(() => {
for (let i = 0, len = filterRules.length; i < len; i++) {
lineCb(filterRules[i]);
}
});
span.traceChild('parse adguard filter').traceSyncFn(() => {
for (let i = 0, len = filterRules.length; i < len; i++) {
lineCb(filterRules[i]);
}
});
return [
Array.from(whitelistDomainSets),
Array.from(blacklistDomainSets),
warningMessages
] as const;
},
{
ttl,
temporaryBypass,
serializer: JSON.stringify,
deserializer: JSON.parse
}
));
return [
Array.from(whitelistDomainSets),
Array.from(blacklistDomainSets),
warningMessages
] as const;
});
for (let i = 0, len = warningMessages.length; i < len; i++) {
console.warn(