Perf: use filesystem cache

This commit is contained in:
SukkaW 2023-12-31 02:32:07 +08:00
parent 6ed3695e36
commit 85801b1b9e
9 changed files with 144 additions and 73 deletions

View File

@ -12,17 +12,17 @@ import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix';
import picocolors from 'picocolors'; import picocolors from 'picocolors';
import { fetchRemoteTextByLine } from './lib/fetch-text-by-line'; import { fetchRemoteTextByLine } from './lib/fetch-text-by-line';
import { processLine } from './lib/process-line'; import { processLine } from './lib/process-line';
import { TTL, deserializeArray, fsCache, serializeArray } from './lib/cache-filesystem';
const s = new Sema(2); const s = new Sema(2);
const latestTopUserAgentsPromise = fetchWithRetry('https://unpkg.com/top-user-agents@latest/index.json') const latestTopUserAgentsPromise = fetchWithRetry('https://unpkg.com/top-user-agents@latest/index.json')
.then(res => res.json<string[]>()); .then(res => res.json<string[]>()).then(userAgents => userAgents.filter(ua => ua.startsWith('Mozilla/5.0 ')));
const querySpeedtestApi = async (keyword: string): Promise<Array<string | null>> => { const querySpeedtestApi = async (keyword: string): Promise<Array<string | null>> => {
const topUserAgents = (await Promise.all([ const topUserAgents = await latestTopUserAgentsPromise;
latestTopUserAgentsPromise,
s.acquire() const url = `https://www.speedtest.net/api/js/servers?engine=js&search=${keyword}&limit=100`;
]))[0];
try { try {
const randomUserAgent = topUserAgents[Math.floor(Math.random() * topUserAgents.length)]; const randomUserAgent = topUserAgents[Math.floor(Math.random() * topUserAgents.length)];
@ -30,39 +30,51 @@ const querySpeedtestApi = async (keyword: string): Promise<Array<string | null>>
console.log(key); console.log(key);
console.time(key); console.time(key);
const res = await fetchWithRetry(`https://www.speedtest.net/api/js/servers?engine=js&search=${keyword}&limit=100`, { const json = await fsCache.apply(
headers: { url,
dnt: '1', () => s.acquire().then(() => fetchWithRetry(url, {
Referer: 'https://www.speedtest.net/', headers: {
accept: 'application/json, text/plain, */*', dnt: '1',
'User-Agent': randomUserAgent, Referer: 'https://www.speedtest.net/',
'Accept-Language': 'en-US,en;q=0.9', accept: 'application/json, text/plain, */*',
...(randomUserAgent.includes('Chrome') 'User-Agent': randomUserAgent,
? { 'Accept-Language': 'en-US,en;q=0.9',
'Sec-Ch-Ua-Mobile': '?0', ...(randomUserAgent.includes('Chrome')
'Sec-Fetch-Dest': 'empty', ? {
'Sec-Fetch-Mode': 'cors', 'Sec-Ch-Ua-Mobile': '?0',
'Sec-Fetch-Site': 'same-origin', 'Sec-Fetch-Dest': 'empty',
'Sec-Gpc': '1' 'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'Sec-Gpc': '1'
}
: {})
},
signal: AbortSignal.timeout(1000 * 4),
retry: {
retries: 2
}
})).then(r => r.json<Array<{ url: string }>>()).then(data => data.reduce<string[]>(
(prev, cur) => {
const hn = tldts.getHostname(cur.url, { detectIp: false });
if (hn) {
prev.push(hn);
} }
: {}) return prev;
}, }, []
signal: AbortSignal.timeout(1000 * 4), )).finally(() => s.release()),
retry: { {
retries: 2 ttl: TTL.ONE_WEEK(),
serializer: serializeArray,
deserializer: deserializeArray
} }
}); );
const json = await res.json<Array<{ url: string }>>();
console.timeEnd(key); console.timeEnd(key);
return json.map(({ url }) => tldts.getHostname(url, { detectIp: false })); return json;
} catch (e) { } catch (e) {
console.log(e); console.log(e);
return []; return [];
} finally {
s.release();
} }
}; };

View File

@ -4,10 +4,10 @@ import path from 'path';
import { fetchWithRetry } from './lib/fetch-retry'; import { fetchWithRetry } from './lib/fetch-retry';
const ASSETS_LIST = { const ASSETS_LIST = {
'www-google-analytics-com_ga.js': 'https://unpkg.com/@adguard/scriptlets@1/dist/redirect-files/google-analytics-ga.js', 'www-google-analytics-com_ga.js': 'https://raw.githubusercontent.com/AdguardTeam/Scriptlets/master/dist/redirect-files/google-analytics-ga.js',
'www-googletagservices-com_gpt.js': 'https://unpkg.com/@adguard/scriptlets@1/dist/redirect-files/googletagservices-gpt.js', 'www-googletagservices-com_gpt.js': 'https://raw.githubusercontent.com/AdguardTeam/Scriptlets/master/dist/redirect-files/googletagservices-gpt.js',
'www-google-analytics-com_analytics.js': 'https://unpkg.com/@adguard/scriptlets@1/dist/redirect-files/google-analytics.js', 'www-google-analytics-com_analytics.js': 'https://raw.githubusercontent.com/AdguardTeam/Scriptlets/master/dist/redirect-files/google-analytics.js',
'www-googlesyndication-com_adsbygoogle.js': 'https://unpkg.com/@adguard/scriptlets@1/dist/redirect-files/googlesyndication-adsbygoogle.js' 'www-googlesyndication-com_adsbygoogle.js': 'https://raw.githubusercontent.com/AdguardTeam/Scriptlets/master/dist/redirect-files/googlesyndication-adsbygoogle.js'
} as const; } as const;
const mockDir = path.resolve(import.meta.dir, '../Mock'); const mockDir = path.resolve(import.meta.dir, '../Mock');

View File

@ -1,5 +1,20 @@
import { TTL, fsCache } from './lib/cache-filesystem';
import { defaultRequestInit, fetchWithRetry } from './lib/fetch-retry'; import { defaultRequestInit, fetchWithRetry } from './lib/fetch-retry';
import { createMemoizedPromise } from './lib/memo-promise'; import { createMemoizedPromise } from './lib/memo-promise';
import { traceAsync } from './lib/trace-runner'; import { traceAsync } from './lib/trace-runner';
export const getPublicSuffixListTextPromise = createMemoizedPromise(() => traceAsync('obtain public_suffix_list', () => fetchWithRetry('https://publicsuffix.org/list/public_suffix_list.dat', defaultRequestInit).then(r => r.text()))); export const getPublicSuffixListTextPromise = createMemoizedPromise(
() => traceAsync(
'obtain public_suffix_list',
() => fsCache.apply(
'https://publicsuffix.org/list/public_suffix_list.dat',
() => fetchWithRetry('https://publicsuffix.org/list/public_suffix_list.dat', defaultRequestInit).then(r => r.text()),
{
// https://github.com/publicsuffix/list/blob/master/.github/workflows/tld-update.yml
// Though the action runs every 24 hours, the IANA list is updated every 7 days.
// So a 3 day TTL should be enough.
ttl: TTL.THREE_DAYS()
}
)
)
);

View File

@ -127,12 +127,28 @@ export class Cache {
} }
} }
// export const fsCache = new Cache({ cachePath: path.resolve(import.meta.dir, '../../.cache') }); export const fsCache = new Cache({ cachePath: path.resolve(import.meta.dir, '../../.cache') });
// process.on('exit', () => { // process.on('exit', () => {
// fsCache.destroy(); // fsCache.destroy();
// }); // });
const separator = String.fromCharCode(0); const randomInt = (min: number, max: number) => Math.floor(Math.random() * (max - min + 1)) + min;
/**
 * Cache TTL presets, in milliseconds.
 *
 * Every preset is a function rather than a constant: each call picks a random
 * whole number of hours/days around the nominal duration (bounds inclusive),
 * so entries cached at the same moment do not all expire together and trigger
 * a thundering herd of refetches.
 */
export const TTL = {
  /** Nominally 12 hours; yields 9 to 14 hours. */
  TWELVE_HOURS: () => randomInt(9, 14) * 60 * 60 * 1000,
  /** @deprecated Misspelled alias of `TWELVE_HOURS`, kept so existing callers keep working. */
  TWLVE_HOURS: () => randomInt(9, 14) * 60 * 60 * 1000,
  /** Nominally 3 days; yields 2 to 4 days. */
  THREE_DAYS: () => randomInt(2, 4) * 24 * 60 * 60 * 1000,
  /** Nominally 1 week; yields 5 to 8 days. */
  ONE_WEEK: () => randomInt(5, 8) * 24 * 60 * 60 * 1000,
  /** Nominally 2 weeks; yields 12 to 16 days. */
  TWO_WEEKS: () => randomInt(12, 16) * 24 * 60 * 60 * 1000,
  /** Nominally 10 days; yields 9 to 11 days. */
  TEN_DAYS: () => randomInt(9, 11) * 24 * 60 * 60 * 1000
};
const separator = String.fromCharCode(0);
// const textEncoder = new TextEncoder();
// const textDecoder = new TextDecoder();
// export const serializeString = (str: string) => textEncoder.encode(str);
// export const deserializeString = (str: string) => textDecoder.decode(new Uint8Array(str.split(separator).map(Number)));
export const serializeSet = (set: Set<string>) => Array.from(set).join(separator); export const serializeSet = (set: Set<string>) => Array.from(set).join(separator);
export const deserializeSet = (str: string) => new Set(str.split(separator)); export const deserializeSet = (str: string) => new Set(str.split(separator));
// Encodes a string array as one string by joining its entries with `separator`
// (the character with code 0), so it can be stored as a cache value.
// Assumes no entry contains that character — TODO confirm against callers.
// Note: both `[]` and `['']` encode to the empty string.
export const serializeArray = (arr: string[]) => arr.join(separator);
/**
 * Inverse of `serializeArray`: splits a cached string on `separator`
 * (the character with code 0).
 *
 * The empty string decodes to an empty array. A bare `''.split(separator)`
 * returns `['']`, which would turn a cached empty result into a list holding
 * one empty entry. The trade-off is that `['']`, which also encodes to `''`,
 * now decodes to `[]` as well.
 */
export const deserializeArray = (str: string): string[] => (str === '' ? [] : str.split(separator));

View File

@ -9,14 +9,15 @@ import { traceAsync } from './trace-runner';
import picocolors from 'picocolors'; import picocolors from 'picocolors';
import { normalizeDomain } from './normalize-domain'; import { normalizeDomain } from './normalize-domain';
import { fetchAssets } from './fetch-assets'; import { fetchAssets } from './fetch-assets';
import { deserializeSet, fsCache, serializeSet } from './cache-filesystem';
const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
let foundDebugDomain = false; let foundDebugDomain = false;
export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false, _ttl: number | null = null) { export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false, ttl: number | null = null) {
return traceAsync(`- processDomainLists: ${domainListsUrl}`, /* () => fsCache.apply( return traceAsync(`- processDomainLists: ${domainListsUrl}`, () => fsCache.apply(
domainListsUrl, domainListsUrl,
*/async () => { async () => {
const domainSets = new Set<string>(); const domainSets = new Set<string>();
for await (const line of await fetchRemoteTextByLine(domainListsUrl)) { for await (const line of await fetchRemoteTextByLine(domainListsUrl)) {
@ -32,19 +33,19 @@ export function processDomainLists(domainListsUrl: string, includeAllSubDomain =
} }
return domainSets; return domainSets;
});/* , },
{ {
ttl, ttl,
temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null, temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
serializer: serializeSet, serializer: serializeSet,
deserializer: deserializeSet deserializer: deserializeSet
} }
)); */ ));
} }
export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false, _ttl: number | null = null) { export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false, ttl: number | null = null) {
return traceAsync(`- processHosts: ${hostsUrl}`, /* () => fsCache.apply( return traceAsync(`- processHosts: ${hostsUrl}`, () => fsCache.apply(
hostsUrl, hostsUrl,
*/async () => { async () => {
const domainSets = new Set<string>(); const domainSets = new Set<string>();
for await (const l of await fetchRemoteTextByLine(hostsUrl)) { for await (const l of await fetchRemoteTextByLine(hostsUrl)) {
@ -73,14 +74,14 @@ export function processHosts(hostsUrl: string, includeAllSubDomain = false, skip
console.log(picocolors.gray('[process hosts]'), picocolors.gray(hostsUrl), picocolors.gray(domainSets.size)); console.log(picocolors.gray('[process hosts]'), picocolors.gray(hostsUrl), picocolors.gray(domainSets.size));
return domainSets; return domainSets;
}); },
/* { {
ttl, ttl,
temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null, temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
serializer: serializeSet, serializer: serializeSet,
deserializer: deserializeSet deserializer: deserializeSet
} }
) */ ));
} }
// eslint-disable-next-line sukka-ts/no-const-enum -- bun bundler is smart, maybe? // eslint-disable-next-line sukka-ts/no-const-enum -- bun bundler is smart, maybe?
@ -95,15 +96,15 @@ const enum ParseType {
export async function processFilterRules( export async function processFilterRules(
filterRulesUrl: string, filterRulesUrl: string,
fallbackUrls?: readonly string[] | undefined | null, fallbackUrls?: readonly string[] | undefined | null,
_ttl: number | null = null ttl: number | null = null
): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> { ): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> {
const [white, black, warningMessages] = await traceAsync(`- processFilterRules: ${filterRulesUrl}`, /* () => fsCache.apply<[ const [white, black, warningMessages] = await traceAsync(`- processFilterRules: ${filterRulesUrl}`, () => fsCache.apply<[
white: string[], white: string[],
black: string[], black: string[],
warningMessages: string[] warningMessages: string[]
]>( ]>(
filterRulesUrl, filterRulesUrl,
*/async () => { async () => {
const whitelistDomainSets = new Set<string>(); const whitelistDomainSets = new Set<string>();
const blacklistDomainSets = new Set<string>(); const blacklistDomainSets = new Set<string>();
@ -168,7 +169,7 @@ export async function processFilterRules(
// TODO-SUKKA: add cache here // TODO-SUKKA: add cache here
if (!fallbackUrls || fallbackUrls.length === 0) { if (!fallbackUrls || fallbackUrls.length === 0) {
for await (const line of await fetchRemoteTextByLine(filterRulesUrl)) { for await (const line of await fetchRemoteTextByLine(filterRulesUrl)) {
// don't trim here // don't trim here
lineCb(line); lineCb(line);
} }
} else { } else {
@ -191,14 +192,14 @@ export async function processFilterRules(
Array.from(blacklistDomainSets), Array.from(blacklistDomainSets),
warningMessages warningMessages
]; ];
}); },
/* { {
ttl, ttl,
temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null, temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
serializer: JSON.stringify, serializer: JSON.stringify,
deserializer: JSON.parse deserializer: JSON.parse
} }
) */ ));
warningMessages.forEach(msg => { warningMessages.forEach(msg => {
console.warn( console.warn(

View File

@ -0,0 +1,16 @@
import { bench, group, run } from 'mitata';
import { randomInt as nativeRandomInt } from 'crypto';
/**
 * Pseudo-random integer between `min` and `max`, both ends included,
 * derived from `Math.random()`. Intended for integer bounds with `min <= max`.
 */
const randomInt = (min: number, max: number) => {
  // Number of distinct integers in the inclusive range.
  const span = max - min + 1;
  return min + Math.floor(Math.random() * span);
};
// Micro-benchmark comparing two ways of drawing a small random integer:
// Node's `crypto.randomInt` versus the local `Math.random`-based `randomInt`.
// NOTE(review): per the Node.js docs `crypto.randomInt(min, max)` excludes `max`,
// while the local helper includes it, so the two calls cover slightly different
// ranges — presumably irrelevant for timing; confirm if exact parity matters.
group('random-int', () => {
// Baseline: the built-in implementation imported from 'crypto'.
bench('crypto.randomInt', () => {
nativeRandomInt(3, 7);
});
// Candidate: the local helper built on Math.random.
bench('Math.random', () => {
randomInt(3, 7);
});
});
// Execute the benchmarks registered above.
run();

View File

@ -1,14 +1,20 @@
import { TTL } from './cache-filesystem';
export const HOSTS = [ export const HOSTS = [
['https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=0&mimetype=plaintext', true], ['https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=0&mimetype=plaintext', true],
['https://someonewhocares.org/hosts/hosts', true], ['https://someonewhocares.org/hosts/hosts', true],
['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', false], // no coin list is not actively maintained, but it updates daily when being maintained, so we set a 3 days cache ttl
['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', true], ['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', false, false, TTL.THREE_DAYS()],
// WindowsSpyBlocker has not been updated for more than a year, so we set a 14 days cache ttl
['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', true, false, TTL.TWO_WEEKS()],
['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Xiaomi-Extension.txt', false], ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Xiaomi-Extension.txt', false],
['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', false], ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Huawei-Extension.txt', false],
// ad-wars is not actively maintained, so we set a 7 days cache ttl
['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', false, false, TTL.ONE_WEEK()],
['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', true], ['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', true],
// CoinBlockerList // CoinBlockerList
// Although the hosts file is still actively maintained, the hosts_browser file is not updated since 2021-07, so we set a 10 days cache ttl // Although the hosts file is still actively maintained, the hosts_browser file is not updated since 2021-07, so we set a 14 days cache ttl
['https://zerodot1.gitlab.io/CoinBlockerLists/hosts_browser', true, true, 10 * 24 * 60 * 60 * 1000], ['https://zerodot1.gitlab.io/CoinBlockerLists/hosts_browser', true, true, TTL.TWO_WEEKS()],
// Curben's UrlHaus Malicious URL Blocklist // Curben's UrlHaus Malicious URL Blocklist
// 'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-agh-online.txt', // 'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-agh-online.txt',
// 'https://urlhaus-filter.pages.dev/urlhaus-filter-agh-online.txt', // 'https://urlhaus-filter.pages.dev/urlhaus-filter-agh-online.txt',
@ -21,23 +27,24 @@ export const HOSTS = [
// Curben's PUP Domains Blocklist // Curben's PUP Domains Blocklist
// 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt' // 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt'
// 'https://pup-filter.pages.dev/pup-filter-agh.txt' // 'https://pup-filter.pages.dev/pup-filter-agh.txt'
// The PUP filter has paused the update since 2023-05, so we set a 7 days cache ttl // The PUP filter has paused the update since 2023-05, so we set a 14 days cache ttl
['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, true, 7 * 24 * 60 * 60 * 1000] ['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, true, TTL.TWO_WEEKS()]
] as const; ] as const;
export const DOMAIN_LISTS = [ export const DOMAIN_LISTS = [
// BarbBlock // BarbBlock
// The barbblock list has never been updated since 2019-05, so we set a 10 days cache ttl // The barbblock list has never been updated since 2019-05, so we set a 14 days cache ttl
['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', true, 10 * 24 * 60 * 60 * 1000], ['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', true, TTL.TWO_WEEKS()],
// DigitalSide Threat-Intel - OSINT Hub // DigitalSide Threat-Intel - OSINT Hub
['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true], // Update once per day
['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true, 24 * 60 * 60 * 1000],
// AdGuard CNAME Filter Combined // AdGuard CNAME Filter Combined
// Update on a 7 days basis, so we add a 36 hours cache ttl // Update on a 7 days basis, so we add a 3 days cache ttl
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, 36 * 60 * 60 * 1000], ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, TTL.THREE_DAYS()],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, 36 * 60 * 60 * 1000], ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, TTL.THREE_DAYS()],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, 36 * 60 * 60 * 1000], ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, TTL.THREE_DAYS()],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, 36 * 60 * 60 * 1000], ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, TTL.THREE_DAYS()],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, 36 * 60 * 60 * 1000] ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, TTL.THREE_DAYS()]
] as const; ] as const;
export const ADGUARD_FILTERS = [ export const ADGUARD_FILTERS = [
@ -130,14 +137,17 @@ export const ADGUARD_FILTERS = [
// GameConsoleAdblockList // GameConsoleAdblockList
'https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt', 'https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt',
// PiHoleBlocklist // PiHoleBlocklist
// Updated roughly once every 3 months, so let's set a 10 days cache ttl
[ [
'https://perflyst.github.io/PiHoleBlocklist/SmartTV-AGH.txt', 'https://perflyst.github.io/PiHoleBlocklist/SmartTV-AGH.txt',
[ [
'https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt' 'https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt'
] ],
TTL.TEN_DAYS()
], ],
// Spam404 // Spam404
'https://raw.githubusercontent.com/Spam404/lists/master/adblock-list.txt', // Not actively maintained, let's use a 10 days cache ttl
['https://raw.githubusercontent.com/Spam404/lists/master/adblock-list.txt', null, TTL.TEN_DAYS()],
// Brave First Party & First Party CNAME // Brave First Party & First Party CNAME
'https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt' 'https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt'
] as const; ] as const;

BIN
bun.lockb

Binary file not shown.

View File

@ -39,6 +39,7 @@
"eslint": "^8.56.0", "eslint": "^8.56.0",
"eslint-config-sukka": "4.1.10-beta.2", "eslint-config-sukka": "4.1.10-beta.2",
"eslint-formatter-sukka": "4.1.9", "eslint-formatter-sukka": "4.1.9",
"mitata": "^0.1.6",
"typescript": "^5.3.3" "typescript": "^5.3.3"
}, },
"resolutions": { "resolutions": {