Perf: use filesystem cache

This commit is contained in:
SukkaW
2023-12-31 02:32:07 +08:00
parent 6ed3695e36
commit 85801b1b9e
9 changed files with 144 additions and 73 deletions

View File

@@ -127,12 +127,28 @@ export class Cache {
}
}
// export const fsCache = new Cache({ cachePath: path.resolve(import.meta.dir, '../../.cache') });
// Shared cache instance used by the build scripts. Its `cachePath` resolves to
// `../../.cache` relative to `import.meta.dir` (the directory of this module under Bun).
export const fsCache = new Cache({ cachePath: path.resolve(import.meta.dir, '../../.cache') });
// process.on('exit', () => {
// fsCache.destroy();
// });
const separator = String.fromCharCode(0);
/**
 * Returns a pseudo-random integer in the inclusive range [min, max].
 *
 * Backed by `Math.random`, so it is only meant for jitter, not for anything
 * security-sensitive. Callers are expected to pass integers with min <= max.
 */
const randomInt = (min: number, max: number) => Math.floor(Math.random() * (max - min + 1)) + min;

/**
 * Cache TTL presets, in milliseconds.
 *
 * Every preset is a function rather than a constant: each call draws a fresh
 * value from a small range around the nominal duration. The randomness spreads
 * out expiry times so that entries do not all expire at once (thundering herd).
 */
export const TTL = {
  /** Between 9 and 14 hours (inclusive, whole hours). */
  TWELVE_HOURS: () => randomInt(9, 14) * 60 * 60 * 1000,
  /** @deprecated Misspelled name kept for backward compatibility; use `TWELVE_HOURS`. */
  TWLVE_HOURS: () => randomInt(9, 14) * 60 * 60 * 1000,
  /** Between 2 and 4 days (inclusive, whole days). */
  THREE_DAYS: () => randomInt(2, 4) * 24 * 60 * 60 * 1000,
  /** Between 5 and 8 days (inclusive, whole days). */
  ONE_WEEK: () => randomInt(5, 8) * 24 * 60 * 60 * 1000,
  /** Between 12 and 16 days (inclusive, whole days). */
  TWO_WEEKS: () => randomInt(12, 16) * 24 * 60 * 60 * 1000,
  /** Between 9 and 11 days (inclusive, whole days). */
  TEN_DAYS: () => randomInt(9, 11) * 24 * 60 * 60 * 1000
};
// Delimiter placed between elements when a collection is flattened into one string: NUL (U+0000).
// Elements containing this character cannot be round-tripped.
const separator = String.fromCharCode(0);
// const textEncoder = new TextEncoder();
// const textDecoder = new TextDecoder();
// export const serializeString = (str: string) => textEncoder.encode(str);
// export const deserializeString = (str: string) => textDecoder.decode(new Uint8Array(str.split(separator).map(Number)));

/** Flattens a set of strings into a single NUL-delimited string (insertion order preserved). */
export const serializeSet = (set: Set<string>): string => Array.from(set).join(separator);

/**
 * Inverse of `serializeSet`.
 *
 * An empty input yields an empty set. Without this guard `''.split(separator)`
 * returns `['']`, so an empty set would come back containing one empty string.
 * (A set holding only `''` also serializes to `''`; it is decoded as empty.)
 */
export const deserializeSet = (str: string): Set<string> => new Set(str === '' ? [] : str.split(separator));

/** Flattens an array of strings into a single NUL-delimited string. */
export const serializeArray = (arr: string[]): string => arr.join(separator);

/**
 * Inverse of `serializeArray`.
 *
 * An empty input yields an empty array rather than `['']`, so that
 * `deserializeArray(serializeArray([]))` round-trips correctly.
 */
export const deserializeArray = (str: string): string[] => (str === '' ? [] : str.split(separator));

View File

@@ -9,14 +9,15 @@ import { traceAsync } from './trace-runner';
import picocolors from 'picocolors';
import { normalizeDomain } from './normalize-domain';
import { fetchAssets } from './fetch-assets';
import { deserializeSet, fsCache, serializeSet } from './cache-filesystem';
const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
let foundDebugDomain = false;
export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false, _ttl: number | null = null) {
return traceAsync(`- processDomainLists: ${domainListsUrl}`, /* () => fsCache.apply(
export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false, ttl: number | null = null) {
return traceAsync(`- processDomainLists: ${domainListsUrl}`, () => fsCache.apply(
domainListsUrl,
*/async () => {
async () => {
const domainSets = new Set<string>();
for await (const line of await fetchRemoteTextByLine(domainListsUrl)) {
@@ -32,19 +33,19 @@ export function processDomainLists(domainListsUrl: string, includeAllSubDomain =
}
return domainSets;
});/* ,
},
{
ttl,
temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
serializer: serializeSet,
deserializer: deserializeSet
}
)); */
));
}
export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false, _ttl: number | null = null) {
return traceAsync(`- processHosts: ${hostsUrl}`, /* () => fsCache.apply(
export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false, ttl: number | null = null) {
return traceAsync(`- processHosts: ${hostsUrl}`, () => fsCache.apply(
hostsUrl,
*/async () => {
async () => {
const domainSets = new Set<string>();
for await (const l of await fetchRemoteTextByLine(hostsUrl)) {
@@ -73,14 +74,14 @@ export function processHosts(hostsUrl: string, includeAllSubDomain = false, skip
console.log(picocolors.gray('[process hosts]'), picocolors.gray(hostsUrl), picocolors.gray(domainSets.size));
return domainSets;
});
/* {
},
{
ttl,
temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
serializer: serializeSet,
deserializer: deserializeSet
}
) */
));
}
// eslint-disable-next-line sukka-ts/no-const-enum -- bun bundler is smart, maybe?
@@ -95,15 +96,15 @@ const enum ParseType {
export async function processFilterRules(
filterRulesUrl: string,
fallbackUrls?: readonly string[] | undefined | null,
_ttl: number | null = null
ttl: number | null = null
): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> {
const [white, black, warningMessages] = await traceAsync(`- processFilterRules: ${filterRulesUrl}`, /* () => fsCache.apply<[
const [white, black, warningMessages] = await traceAsync(`- processFilterRules: ${filterRulesUrl}`, () => fsCache.apply<[
white: string[],
black: string[],
warningMessages: string[]
]>(
filterRulesUrl,
*/async () => {
async () => {
const whitelistDomainSets = new Set<string>();
const blacklistDomainSets = new Set<string>();
@@ -168,7 +169,7 @@ export async function processFilterRules(
// TODO-SUKKA: add cache here
if (!fallbackUrls || fallbackUrls.length === 0) {
for await (const line of await fetchRemoteTextByLine(filterRulesUrl)) {
// don't trim here
// don't trim here
lineCb(line);
}
} else {
@@ -191,14 +192,14 @@ export async function processFilterRules(
Array.from(blacklistDomainSets),
warningMessages
];
});
/* {
},
{
ttl,
temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
serializer: JSON.stringify,
deserializer: JSON.parse
}
) */
));
warningMessages.forEach(msg => {
console.warn(

View File

@@ -0,0 +1,16 @@
import { bench, group, run } from 'mitata';
import { randomInt as nativeRandomInt } from 'crypto';
// Benchmark subject: Math.random-based integer in the inclusive range [min, max].
const randomInt = (min: number, max: number) => Math.floor(Math.random() * (max - min + 1)) + min;
// Compares Node's crypto.randomInt against the Math.random-based helper over the same range.
group('random-int', () => {
  bench('crypto.randomInt', () => {
    // crypto.randomInt treats `max` as exclusive, so 8 here yields 3..7,
    // the same range the inclusive Math.random helper produces for (3, 7).
    nativeRandomInt(3, 8);
  });
  bench('Math.random', () => {
    randomInt(3, 7);
  });
});
run();

View File

@@ -1,14 +1,20 @@
import { TTL } from './cache-filesystem';
export const HOSTS = [
['https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=0&mimetype=plaintext', true],
['https://someonewhocares.org/hosts/hosts', true],
['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', false],
['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', true],
// no coin list is not actively maintained, but it updates daily when being maintained, so we set a 3 days cache ttl
['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', false, false, TTL.THREE_DAYS()],
// have not been updated for more than a year, so we set a 14 days cache ttl
['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', true, false, TTL.TWO_WEEKS()],
['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Xiaomi-Extension.txt', false],
['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', false],
['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Huawei-Extension.txt', false],
// ad-wars is not actively maintained, so we set a 7 days cache ttl
['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', false, false, TTL.ONE_WEEK()],
['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', true],
// CoinBlockerList
// Although the hosts file is still actively maintained, the hosts_browser file is not updated since 2021-07, so we set a 10 days cache ttl
['https://zerodot1.gitlab.io/CoinBlockerLists/hosts_browser', true, true, 10 * 24 * 60 * 60 * 1000],
// Although the hosts file is still actively maintained, the hosts_browser file is not updated since 2021-07, so we set a 14 days cache ttl
['https://zerodot1.gitlab.io/CoinBlockerLists/hosts_browser', true, true, TTL.TWO_WEEKS()],
// Curben's UrlHaus Malicious URL Blocklist
// 'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-agh-online.txt',
// 'https://urlhaus-filter.pages.dev/urlhaus-filter-agh-online.txt',
@@ -21,23 +27,24 @@ export const HOSTS = [
// Curben's PUP Domains Blocklist
// 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt'
// 'https://pup-filter.pages.dev/pup-filter-agh.txt'
// The PUP filter has paused the update since 2023-05, so we set a 7 days cache ttl
['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, true, 7 * 24 * 60 * 60 * 1000]
// The PUP filter has paused the update since 2023-05, so we set a 14 days cache ttl
['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, true, TTL.TWO_WEEKS()]
] as const;
export const DOMAIN_LISTS = [
// BarbBlock
// The barbblock list has never been updated since 2019-05, so we set a 10 days cache ttl
['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', true, 10 * 24 * 60 * 60 * 1000],
// The barbblock list has never been updated since 2019-05, so we set a 14 days cache ttl
['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', true, TTL.TWO_WEEKS()],
// DigitalSide Threat-Intel - OSINT Hub
['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true],
// Update once per day
['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true, 24 * 60 * 60 * 1000],
// AdGuard CNAME Filter Combined
// Update on a 7 days basis, so we add a 36 hours cache ttl
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, 36 * 60 * 60 * 1000],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, 36 * 60 * 60 * 1000],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, 36 * 60 * 60 * 1000],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, 36 * 60 * 60 * 1000],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, 36 * 60 * 60 * 1000]
// Updated on a 7-day basis, so we add a 3 days cache ttl
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, TTL.THREE_DAYS()],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, TTL.THREE_DAYS()],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, TTL.THREE_DAYS()],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, TTL.THREE_DAYS()],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, TTL.THREE_DAYS()]
] as const;
export const ADGUARD_FILTERS = [
@@ -130,14 +137,17 @@ export const ADGUARD_FILTERS = [
// GameConsoleAdblockList
'https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt',
// PiHoleBlocklist
// Update almost once per 3 months, let's set a 10 days cache ttl
[
'https://perflyst.github.io/PiHoleBlocklist/SmartTV-AGH.txt',
[
'https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt'
]
],
TTL.TEN_DAYS()
],
// Spam404
'https://raw.githubusercontent.com/Spam404/lists/master/adblock-list.txt',
// Not actively maintained, let's use a 10 days cache ttl
['https://raw.githubusercontent.com/Spam404/lists/master/adblock-list.txt', null, TTL.TEN_DAYS()],
// Brave First Party & First Party CNAME
'https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt'
] as const;