Chore/CI: use fs cache to save bandwidth

This commit is contained in:
SukkaW
2023-12-23 04:27:35 +08:00
parent 7fbd4a570f
commit 230ac3eb18
21 changed files with 358 additions and 206 deletions

View File

@@ -0,0 +1,131 @@
// eslint-disable-next-line import/no-unresolved -- bun built-in module
import { Database } from 'bun:sqlite';
import os from 'os';
import path from 'path';
import fs from 'fs';
import picocolors from 'picocolors';
// Pass-through used as the default (de)serializer when the cached value is already a string.
// Deliberately typed `any`: a generic or `unknown` signature would not unify with the
// typed `serializer`/`deserializer` members at the call sites in `Cache.apply`.
const identity = (x: any) => x;
// Tri-state freshness result returned by `Cache.has`.
// eslint-disable-next-line sukka-ts/no-const-enum -- bun is smart, right?
const enum CacheStatus {
  // A row exists and its ttl is still in the future.
  Hit = 'hit',
  // A row exists but its ttl has passed (expired, not yet purged).
  Stale = 'stale',
  // No row exists for the key.
  Miss = 'miss'
}
export interface CacheOptions {
  // Directory holding the sqlite database file (created recursively if missing).
  cachePath?: string,
  // "time before deletion" in ms: how long an expired row may linger before the startup purge removes it.
  tbd?: number
}
// Options for `Cache.apply` when T is not a string: an explicit (de)serializer pair is required.
interface CacheApplyNonStringOption<T> {
  // Entry lifetime in ms; `null` deletes the entry and bypasses the cache entirely.
  ttl?: number | null,
  // Converts the computed value to the string stored in sqlite.
  serializer: (value: T) => string,
  // Converts the stored string back to the value type.
  deserializer: (cached: string) => T,
  // When true, skip the cache for this call (used while debugging).
  temporaryBypass?: boolean
}
// Options for `Cache.apply` when T is a string: no (de)serializer needed.
interface CacheApplyStringOption {
  ttl?: number | null,
  temporaryBypass?: boolean
}
type CacheApplyOption<T> = T extends string ? CacheApplyStringOption : CacheApplyNonStringOption<T>;
/**
 * Small persistent key/value cache backed by a sqlite database on disk.
 * Values are stored as strings; each row carries an absolute expiry timestamp
 * (`ttl` column, epoch ms). Expired rows survive for a grace period (`tbd`)
 * before the startup purge removes them.
 */
export class Cache {
  db: Database;
  // "time before deletion": grace period (ms) an expired row may linger before purge.
  tbd = 60 * 1000;
  cachePath: string;

  constructor({ cachePath = path.join(os.tmpdir() || '/tmp', 'hdc'), tbd }: CacheOptions = {}) {
    this.cachePath = cachePath;
    fs.mkdirSync(this.cachePath, { recursive: true });
    if (tbd != null) this.tbd = tbd;

    const db = new Database(path.join(this.cachePath, 'cache.db'));
    // WAL avoids writer/reader blocking for concurrent access to the db file.
    db.exec('PRAGMA journal_mode = WAL');

    db.prepare('CREATE TABLE IF NOT EXISTS cache (key TEXT PRIMARY KEY, value TEXT, ttl REAL NOT NULL);').run();
    db.prepare('CREATE INDEX IF NOT EXISTS cache_ttl ON cache (ttl);').run();

    // perform purge on startup: drop rows expired for longer than the grace period
    // ttl + tbd < now  =>  ttl < now - tbd
    const now = Date.now() - this.tbd;
    db.prepare('DELETE FROM cache WHERE ttl < ?').run(now);

    this.db = db;
  }

  /** Upsert `value` under `key`, expiring `ttl` milliseconds from now. */
  set(key: string, value: string, ttl = 60 * 1000): void {
    const insert = this.db.prepare(
      'INSERT INTO cache (key, value, ttl) VALUES ($key, $value, $valid) ON CONFLICT(key) DO UPDATE SET value = $value, ttl = $valid'
    );
    insert.run({
      $key: key,
      $value: value,
      $valid: Date.now() + ttl
    });
  }

  /**
   * Raw read of the stored string. NOTE: expiry is deliberately NOT checked
   * here — a stale row is still returned. Use `has()` when freshness matters.
   */
  get(key: string, defaultValue?: string): string | undefined {
    const rv = this.db.prepare<{ value: string }, string>(
      'SELECT value FROM cache WHERE key = ?'
    ).get(key);
    if (!rv) return defaultValue;
    return rv.value;
  }

  /** Freshness probe: Miss (no row), Hit (ttl in the future) or Stale (row expired). */
  has(key: string): CacheStatus {
    const now = Date.now();
    const rv = this.db.prepare<{ ttl: number }, string>('SELECT ttl FROM cache WHERE key = ?').get(key);
    return !rv ? CacheStatus.Miss : (rv.ttl > now ? CacheStatus.Hit : CacheStatus.Stale);
  }

  /** Remove the row for `key`, if any. */
  del(key: string): void {
    this.db.prepare('DELETE FROM cache WHERE key = ?').run(key);
  }

  /**
   * Memoize `fn()` under `key`: return the cached value on a fresh hit,
   * otherwise run `fn()`, store its (serialized) result with `opt.ttl`,
   * and return it. `opt.temporaryBypass` skips the cache entirely;
   * `opt.ttl === null` deletes the entry and bypasses the cache.
   */
  async apply<T>(
    key: string,
    fn: () => Promise<T>,
    opt: CacheApplyOption<T>
  ): Promise<T> {
    const { ttl, temporaryBypass } = opt;

    if (temporaryBypass) {
      return fn();
    }
    if (ttl === null) {
      this.del(key);
      return fn();
    }

    // FIX: only a *fresh* row counts as a hit. Previously `get()` alone was used,
    // so an expired-but-not-yet-purged (stale) row was served as a hit and the
    // Hit/Stale distinction made by `has()` was never consulted. A stale entry
    // now falls through to the miss path and gets recomputed and overwritten.
    const cached = this.has(key) === CacheStatus.Hit ? this.get(key) : undefined;

    let value: T;
    if (cached == null) {
      console.log(picocolors.yellow('[cache] miss'), picocolors.gray(key));

      value = await fn();

      const serializer = 'serializer' in opt ? opt.serializer : identity;
      this.set(key, serializer(value), ttl);
    } else {
      console.log(picocolors.green('[cache] hit'), picocolors.gray(key));

      const deserializer = 'deserializer' in opt ? opt.deserializer : identity;
      value = deserializer(cached);
    }
    return value;
  }
}
// Shared on-disk cache instance, persisted under <repo root>/.cache so CI can restore it.
const fsCachePath = path.resolve(import.meta.dir, '../../.cache');
export const fsCache = new Cache({ cachePath: fsCachePath });
// NUL is a safe join separator here: it cannot occur inside the cached strings (domains/rules).
const separator = String.fromCharCode(0);
/** Serialize a Set of strings into a single NUL-separated string for storage. */
export const serializeSet = (set: Set<string>): string => Array.from(set).join(separator);
/**
 * Inverse of `serializeSet`.
 * FIX: guard the empty string — `''.split(separator)` yields `['']`, so the
 * previous version deserialized an empty Set into a one-element Set containing ''.
 */
export const deserializeSet = (str: string): Set<string> => (
  str.length === 0 ? new Set<string>() : new Set(str.split(separator))
);

View File

@@ -1,5 +1,7 @@
import type { BunFile } from 'bun';
import { fetchWithRetry, defaultRequestInit } from './fetch-retry';
import { fsCache } from './cache-filesystem';
import picocolors from 'picocolors';
// import { TextLineStream } from './text-line-transform-stream';
// import { PolyfillTextDecoderStream } from './text-decoder-stream';
@@ -78,6 +80,6 @@ export async function *createReadlineInterfaceFromResponse(resp: Response): Asyn
}
}
export function fetchRemoteTextAndReadByLine(url: string | URL) {
export function fetchRemoteTextByLine(url: string | URL) {
return fetchWithRetry(url, defaultRequestInit).then(res => createReadlineInterfaceFromResponse(res as Response));
}

View File

@@ -1,23 +1,13 @@
import { toASCII } from 'punycode';
import path from 'path';
import { traceAsync } from './trace-runner';
import { defaultRequestInit, fetchWithRetry } from './fetch-retry';
import { createMemoizedPromise } from './memo-promise';
import { getPublicSuffixListTextPromise } from '../download-publicsuffixlist';
const publicSuffixPath = path.resolve(import.meta.dir, '../../node_modules/.cache/public_suffix_list_dat.txt');
const getGorhillPublicSuffix = () => traceAsync('create gorhill public suffix instance', async () => {
export const getGorhillPublicSuffixPromise = createMemoizedPromise(() => traceAsync('create gorhill public suffix instance', async () => {
const customFetch = (url: string | URL) => Promise.resolve(Bun.file(url));
const publicSuffixFile = Bun.file(publicSuffixPath);
const [publicSuffixListDat, { default: gorhill }] = await Promise.all([
await publicSuffixFile.exists()
? publicSuffixFile.text()
: fetchWithRetry('https://publicsuffix.org/list/public_suffix_list.dat', defaultRequestInit).then(r => {
console.log('public_suffix_list.dat not found, fetch directly from remote.');
return r.text();
}),
getPublicSuffixListTextPromise(),
import('@gorhill/publicsuffixlist')
]);
@@ -25,6 +15,4 @@ const getGorhillPublicSuffix = () => traceAsync('create gorhill public suffix in
await gorhill.enableWASM({ customFetch });
return gorhill;
});
export const getGorhillPublicSuffixPromise = createMemoizedPromise(getGorhillPublicSuffix);
}));

View File

@@ -1,4 +1,4 @@
import { fetchRemoteTextAndReadByLine } from './fetch-text-by-line';
import { fetchRemoteTextByLine } from './fetch-text-by-line';
import { parse } from 'tldts';
const isDomainLoose = (domain: string): boolean => {
@@ -8,7 +8,7 @@ const isDomainLoose = (domain: string): boolean => {
export const parseFelixDnsmasq = async (url: string | URL): Promise<string[]> => {
const res: string[] = [];
for await (const line of await fetchRemoteTextAndReadByLine(url)) {
for await (const line of await fetchRemoteTextByLine(url)) {
if (line.startsWith('server=/') && line.endsWith('/114.114.114.114')) {
const domain = line.replace('server=/', '').replace('/114.114.114.114', '');
if (isDomainLoose(domain)) {

View File

@@ -1,5 +1,5 @@
// @ts-check
import { fetchRemoteTextAndReadByLine } from './fetch-text-by-line';
import { fetchRemoteTextByLine } from './fetch-text-by-line';
import { NetworkFilter } from '@cliqz/adblocker';
import { processLine } from './process-line';
import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
@@ -9,61 +9,79 @@ import { traceAsync } from './trace-runner';
import picocolors from 'picocolors';
import { normalizeDomain } from './normalize-domain';
import { fetchAssets } from './fetch-assets';
import { deserializeSet, fsCache, serializeSet } from './cache-filesystem';
const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
let foundDebugDomain = false;
export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false) {
return traceAsync(`- processDomainLists: ${domainListsUrl}`, async () => {
const domainSets = new Set<string>();
export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false, ttl: number | null = null) {
return traceAsync(`- processDomainLists: ${domainListsUrl}`, () => fsCache.apply(
domainListsUrl,
async () => {
const domainSets = new Set<string>();
for await (const line of await fetchRemoteTextAndReadByLine(domainListsUrl)) {
const domainToAdd = processLine(line);
if (!domainToAdd) continue;
for await (const line of await fetchRemoteTextByLine(domainListsUrl)) {
const domainToAdd = processLine(line);
if (!domainToAdd) continue;
if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
console.warn(picocolors.red(domainListsUrl), '(black)', picocolors.bold(DEBUG_DOMAIN_TO_FIND));
foundDebugDomain = true;
}
if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
console.warn(picocolors.red(domainListsUrl), '(black)', picocolors.bold(DEBUG_DOMAIN_TO_FIND));
foundDebugDomain = true;
}
domainSets.add(includeAllSubDomain ? `.${domainToAdd}` : domainToAdd);
}
return domainSets;
});
}
export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false) {
return traceAsync(`- processHosts: ${hostsUrl}`, async () => {
const domainSets = new Set<string>();
for await (const l of await fetchRemoteTextAndReadByLine(hostsUrl)) {
const line = processLine(l);
if (!line) {
continue;
}
const domain = line.split(/\s/)[1];
if (!domain) {
continue;
}
const _domain = domain.trim();
if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
console.warn(picocolors.red(hostsUrl), '(black)', picocolors.bold(DEBUG_DOMAIN_TO_FIND));
foundDebugDomain = true;
}
const domainToAdd = skipDomainCheck ? _domain : normalizeDomain(_domain);
if (domainToAdd) {
domainSets.add(includeAllSubDomain ? `.${domainToAdd}` : domainToAdd);
}
return domainSets;
},
{
ttl,
temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
serializer: serializeSet,
deserializer: deserializeSet
}
));
}
export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false, ttl: number | null = null) {
return traceAsync(`- processHosts: ${hostsUrl}`, () => fsCache.apply(
hostsUrl,
async () => {
const domainSets = new Set<string>();
console.log(picocolors.gray('[process hosts]'), picocolors.gray(hostsUrl), picocolors.gray(domainSets.size));
for await (const l of await fetchRemoteTextByLine(hostsUrl)) {
const line = processLine(l);
if (!line) {
continue;
}
return domainSets;
});
const domain = line.split(/\s/)[1];
if (!domain) {
continue;
}
const _domain = domain.trim();
if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
console.warn(picocolors.red(hostsUrl), '(black)', picocolors.bold(DEBUG_DOMAIN_TO_FIND));
foundDebugDomain = true;
}
const domainToAdd = skipDomainCheck ? _domain : normalizeDomain(_domain);
if (domainToAdd) {
domainSets.add(includeAllSubDomain ? `.${domainToAdd}` : domainToAdd);
}
}
console.log(picocolors.gray('[process hosts]'), picocolors.gray(hostsUrl), picocolors.gray(domainSets.size));
return domainSets;
},
{
ttl,
temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
serializer: serializeSet,
deserializer: deserializeSet
}
));
}
// eslint-disable-next-line sukka-ts/no-const-enum -- bun bundler is smart, maybe?
@@ -77,90 +95,111 @@ const enum ParseType {
export async function processFilterRules(
filterRulesUrl: string,
fallbackUrls?: readonly string[] | undefined
): Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }> {
const whitelistDomainSets = new Set<string>();
const blacklistDomainSets = new Set<string>();
fallbackUrls?: readonly string[] | undefined | null,
ttl: number | null = null
): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> {
const [white, black, warningMessages] = await traceAsync(`- processFilterRules: ${filterRulesUrl}`, () => fsCache.apply<[
white: string[],
black: string[],
warningMessages: string[]
]>(
filterRulesUrl,
async () => {
const whitelistDomainSets = new Set<string>();
const blacklistDomainSets = new Set<string>();
const warningMessages: string[] = [];
const warningMessages: string[] = [];
await traceAsync(`- processFilterRules: ${filterRulesUrl}`, async () => {
const gorhill = await getGorhillPublicSuffixPromise();
const gorhill = await getGorhillPublicSuffixPromise();
/**
/**
* @param {string} line
*/
const lineCb = (line: string) => {
const result = parse(line, gorhill);
if (!result) {
return;
}
const flag = result[1];
const hostname = result[0];
if (DEBUG_DOMAIN_TO_FIND) {
if (hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
console.warn(
picocolors.red(filterRulesUrl),
flag === ParseType.WhiteIncludeSubdomain || flag === ParseType.WhiteAbsolute
? '(white)'
: '(black)',
picocolors.bold(DEBUG_DOMAIN_TO_FIND)
);
foundDebugDomain = true;
const lineCb = (line: string) => {
const result = parse(line, gorhill);
if (!result) {
return;
}
}
switch (flag) {
case ParseType.WhiteIncludeSubdomain:
if (hostname[0] !== '.') {
whitelistDomainSets.add(`.${hostname}`);
} else {
const flag = result[1];
const hostname = result[0];
if (DEBUG_DOMAIN_TO_FIND) {
if (hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
console.warn(
picocolors.red(filterRulesUrl),
flag === ParseType.WhiteIncludeSubdomain || flag === ParseType.WhiteAbsolute
? '(white)'
: '(black)',
picocolors.bold(DEBUG_DOMAIN_TO_FIND)
);
foundDebugDomain = true;
}
}
switch (flag) {
case ParseType.WhiteIncludeSubdomain:
if (hostname[0] !== '.') {
whitelistDomainSets.add(`.${hostname}`);
} else {
whitelistDomainSets.add(hostname);
}
break;
case ParseType.WhiteAbsolute:
whitelistDomainSets.add(hostname);
}
break;
case ParseType.WhiteAbsolute:
whitelistDomainSets.add(hostname);
break;
case ParseType.BlackAbsolute:
blacklistDomainSets.add(hostname);
break;
case ParseType.BlackIncludeSubdomain:
if (hostname[0] !== '.') {
blacklistDomainSets.add(`.${hostname}`);
} else {
break;
case ParseType.BlackAbsolute:
blacklistDomainSets.add(hostname);
}
break;
case ParseType.ErrorMessage:
warningMessages.push(hostname);
break;
default:
break;
}
};
break;
case ParseType.BlackIncludeSubdomain:
if (hostname[0] !== '.') {
blacklistDomainSets.add(`.${hostname}`);
} else {
blacklistDomainSets.add(hostname);
}
break;
case ParseType.ErrorMessage:
warningMessages.push(hostname);
break;
default:
break;
}
};
if (!fallbackUrls || fallbackUrls.length === 0) {
for await (const line of await fetchRemoteTextAndReadByLine(filterRulesUrl)) {
// TODO-SUKKA: add cache here
if (!fallbackUrls || fallbackUrls.length === 0) {
for await (const line of await fetchRemoteTextByLine(filterRulesUrl)) {
// don't trim here
lineCb(line);
}
} else {
const filterRules = (await traceAsync(
picocolors.gray(`- download ${filterRulesUrl}`),
() => fetchAssets(filterRulesUrl, fallbackUrls),
picocolors.gray
)).split('\n');
lineCb(line);
}
} else {
const filterRules = (await traceAsync(
picocolors.gray(`- download ${filterRulesUrl}`),
() => fetchAssets(filterRulesUrl, fallbackUrls),
picocolors.gray
)).split('\n');
const key = picocolors.gray(`- parse adguard filter ${filterRulesUrl}`);
console.time(key);
for (let i = 0, len = filterRules.length; i < len; i++) {
lineCb(filterRules[i]);
const key = picocolors.gray(`- parse adguard filter ${filterRulesUrl}`);
console.time(key);
for (let i = 0, len = filterRules.length; i < len; i++) {
lineCb(filterRules[i]);
}
console.timeEnd(key);
}
console.timeEnd(key);
return [
Array.from(whitelistDomainSets),
Array.from(blacklistDomainSets),
warningMessages
];
},
{
ttl,
temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
serializer: JSON.stringify,
deserializer: JSON.parse
}
});
));
warningMessages.forEach(msg => {
console.warn(
@@ -172,13 +211,13 @@ export async function processFilterRules(
console.log(
picocolors.gray('[process filter]'),
picocolors.gray(filterRulesUrl),
picocolors.gray(`white: ${whitelistDomainSets.size}`),
picocolors.gray(`black: ${blacklistDomainSets.size}`)
picocolors.gray(`white: ${white.length}`),
picocolors.gray(`black: ${black.length}`)
);
return {
white: whitelistDomainSets,
black: blacklistDomainSets,
white,
black,
foundDebugDomain
};
}

View File

@@ -4,7 +4,7 @@ export const processLine = (line: string): string | null => {
}
const trimmed: string = line.trim();
if (trimmed === '') {
if (trimmed.length === 0) {
return null;
}

View File

@@ -7,11 +7,11 @@ export const HOSTS = [
['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', false],
['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', true],
// CoinBlockerList
['https://zerodot1.gitlab.io/CoinBlockerLists/hosts_browser', true, true],
// Although the hosts file is still actively maintained, the hosts_browser file is not updated since 2021-07, so we set a 10 days cache ttl
['https://zerodot1.gitlab.io/CoinBlockerLists/hosts_browser', true, true, 10 * 24 * 60 * 60 * 1000],
// Curben's UrlHaus Malicious URL Blocklist
// 'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-agh-online.txt',
// 'https://urlhaus-filter.pages.dev/urlhaus-filter-agh-online.txt',
// 'https://ublockorigin.github.io/uAssetsCDN/thirdparties/urlhaus-filter/urlhaus-filter-online.txt',
['https://curbengh.github.io/urlhaus-filter/urlhaus-filter-hosts.txt', true, true],
// Curben's Phishing URL Blocklist
// Covered by lib/get-phishing-domains.ts
@@ -21,14 +21,24 @@ export const HOSTS = [
// Curben's PUP Domains Blocklist
// 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt'
// 'https://pup-filter.pages.dev/pup-filter-agh.txt'
['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, true],
// The PUP filter has paused the update since 2023-05, so we set a 7 days cache ttl
['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, true, 7 * 24 * 60 * 60 * 1000],
// BarbBlock
['https://paulgb.github.io/BarbBlock/blacklists/hosts-file.txt', true, true]
// The barbblock list has never been updated since 2019-05, so we set a 10 days cache ttl
['https://paulgb.github.io/BarbBlock/blacklists/hosts-file.txt', true, true, 10 * 24 * 60 * 60 * 1000]
] as const;
export const DOMAIN_LISTS = [
// DigitalSide Threat-Intel - OSINT Hub
['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true]
['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true],
// AdGuard CNAME Filter Combined
// Update on a 7 days basis, so we add a 36 hours cache ttl
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, 36 * 60 * 60 * 1000],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, 36 * 60 * 60 * 1000],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, 36 * 60 * 60 * 1000],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, 36 * 60 * 60 * 1000],
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, 36 * 60 * 60 * 1000]
] as const;
export const ADGUARD_FILTERS = [
@@ -41,7 +51,8 @@ export const ADGUARD_FILTERS = [
'https://secure.fanboy.co.nz/easylist.txt',
'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easylist.txt',
'https://ublockorigin.pages.dev/thirdparties/easylist.txt'
]
],
12 * 60 * 60 * 1000
],
// EasyPrivacy
[
@@ -52,7 +63,8 @@ export const ADGUARD_FILTERS = [
'https://easylist-downloads.adblockplus.org/easyprivacy.txt',
'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easyprivacy.txt',
'https://ublockorigin.pages.dev/thirdparties/easyprivacy.txt'
]
],
12 * 60 * 60 * 1000
],
// AdGuard DNS Filter
[
@@ -62,12 +74,6 @@ export const ADGUARD_FILTERS = [
'https://adguardteam.github.io/HostlistsRegistry/assets/filter_1.txt'
]
],
// AdGuard CNAME Filter Combined
'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads.txt',
'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers.txt',
'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs.txt',
'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites.txt',
'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers.txt',
// uBlock Origin Filter List
[
'https://ublockorigin.github.io/uAssetsCDN/filters/filters.min.txt',