From 230ac3eb1896444ba045b5630468be670a01a992 Mon Sep 17 00:00:00 2001
From: SukkaW
Date: Sat, 23 Dec 2023 04:27:35 +0800
Subject: [PATCH] Chore/CI: use fs cache to save bandwidth

---
 .github/workflows/main.yml               |   6 +
 .gitignore                               |   1 +
 Build/build-anti-bogus-domain.ts         |   4 +-
 Build/build-cdn-conf.ts                  |  20 +-
 Build/build-chn-cidr.ts                  |   4 +-
 Build/build-internal-reverse-chn-cidr.ts |   4 +-
 Build/build-microsoft-cdn.ts             |   4 +-
 Build/build-reject-domainset.ts          |   6 +-
 Build/build-speedtest-domainset.ts       |   4 +-
 Build/download-previous-build.ts         |  13 +-
 Build/download-publicsuffixlist.ts       |  10 +
 Build/index.ts                           |  15 +-
 Build/lib/cache-filesystem.ts            | 131 +++++++++++
 Build/lib/fetch-text-by-line.ts          |   4 +-
 Build/lib/get-gorhill-publicsuffix.ts    |  20 +-
 Build/lib/parse-dnsmasq.ts               |   4 +-
 Build/lib/parse-filter.ts                | 275 +++++++++++++----------
 Build/lib/process-line.ts                |   2 +-
 Build/lib/reject-data-source.ts          |  32 +--
 Build/validate-domestic.ts               |   4 +-
 Source/domainset/cdn.conf                |   1 +
 21 files changed, 358 insertions(+), 206 deletions(-)
 create mode 100644 Build/download-publicsuffixlist.ts
 create mode 100644 Build/lib/cache-filesystem.ts

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 36222e93..a2e5df60 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -15,6 +15,12 @@ jobs:
         uses: actions/checkout@v4
         with:
           persist-credentials: false
+      - name: Cache cache.db
+        uses: actions/cache@v3
+        with:
+          path: .cache
+          key: ${{ runner.os }}-v1
+
       - uses: oven-sh/setup-bun@v1
         with:
           bun-version: latest
diff --git a/.gitignore b/.gitignore
index 1e3c2d91..fdfee4e0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 node_modules
 .clinic
 .wireit
+.cache
 public

 # $ build output
diff --git a/Build/build-anti-bogus-domain.ts b/Build/build-anti-bogus-domain.ts
index 4af72b63..4c85f2f9 100644
--- a/Build/build-anti-bogus-domain.ts
+++ b/Build/build-anti-bogus-domain.ts
@@ -1,7 +1,7 @@
 // @ts-check
 import path from 'path';
 import { createRuleset } from './lib/create-file';
-import { fetchRemoteTextAndReadByLine, readFileByLine } from './lib/fetch-text-by-line';
+import { fetchRemoteTextByLine, readFileByLine } from './lib/fetch-text-by-line';
 import { processLine } from './lib/process-line';
 import { task } from './lib/trace-runner';
 import { SHARED_DESCRIPTION } from './lib/constants';
@@ -9,7 +9,7 @@ import { isProbablyIpv4, isProbablyIpv6 } from './lib/is-fast-ip';

 const getBogusNxDomainIPs = async () => {
   const result: string[] = [];
-  for await (const line of await fetchRemoteTextAndReadByLine('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/bogus-nxdomain.china.conf')) {
+  for await (const line of await fetchRemoteTextByLine('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/bogus-nxdomain.china.conf')) {
     if (line && line.startsWith('bogus-nxdomain=')) {
       const ip = line.slice(15).trim();
       if (isProbablyIpv4(ip)) {
diff --git a/Build/build-cdn-conf.ts b/Build/build-cdn-conf.ts
index b1a986fa..ca719323 100644
--- a/Build/build-cdn-conf.ts
+++ b/Build/build-cdn-conf.ts
@@ -1,27 +1,15 @@
 import path from 'path';
 import { createRuleset } from './lib/create-file';
-import { fetchRemoteTextAndReadByLine, readFileByLine } from './lib/fetch-text-by-line';
+import { readFileByLine } from './lib/fetch-text-by-line';
 import { createTrie } from './lib/trie';
 import { task } from './lib/trace-runner';
 import { processLine } from './lib/process-line';
 import { SHARED_DESCRIPTION } from './lib/constants';
-
-const publicSuffixPath: string = path.resolve(import.meta.dir, '../node_modules/.cache/public_suffix_list_dat.txt');
-
+import { getPublicSuffixListTextPromise } from './download-publicsuffixlist';

 const getS3OSSDomains = async (): Promise<Set<string>> => {
   const trie = createTrie();
-
-  const publicSuffixFile = Bun.file(publicSuffixPath);
-
-  if (await publicSuffixFile.exists()) {
-    for await (const line of readFileByLine(publicSuffixFile)) {
-      trie.add(line);
-    }
-  } else {
-    console.log('public_suffix_list.dat not found, fetch directly from remote.');
-    for await (const line of await fetchRemoteTextAndReadByLine('https://publicsuffix.org/list/public_suffix_list.dat')) {
-      trie.add(line);
-    }
-  }
+  for await (const line of (await getPublicSuffixListTextPromise()).split('\n')) {
+    trie.add(line);
+  }

   /**
diff --git a/Build/build-chn-cidr.ts b/Build/build-chn-cidr.ts
index 14c1dfe4..b32fb838 100644
--- a/Build/build-chn-cidr.ts
+++ b/Build/build-chn-cidr.ts
@@ -1,4 +1,4 @@
-import { fetchRemoteTextAndReadByLine } from './lib/fetch-text-by-line';
+import { fetchRemoteTextByLine } from './lib/fetch-text-by-line';
 import { resolve as pathResolve } from 'path';
 import { compareAndWriteFile, withBannerArray } from './lib/create-file';
 import { processLineFromReadline } from './lib/process-line';
@@ -21,7 +21,7 @@ const INCLUDE_CIDRS = [
 export const getChnCidrPromise = createMemoizedPromise(async () => {
   const cidr = await traceAsync(
     picocolors.gray('download chnroutes2'),
-    async () => processLineFromReadline(await fetchRemoteTextAndReadByLine('https://raw.githubusercontent.com/misakaio/chnroutes2/master/chnroutes.txt')),
+    async () => processLineFromReadline(await fetchRemoteTextByLine('https://raw.githubusercontent.com/misakaio/chnroutes2/master/chnroutes.txt')),
     picocolors.gray
   );
   return traceSync(
diff --git a/Build/build-internal-reverse-chn-cidr.ts b/Build/build-internal-reverse-chn-cidr.ts
index 45afc0a5..c3a5bcb5 100644
--- a/Build/build-internal-reverse-chn-cidr.ts
+++ b/Build/build-internal-reverse-chn-cidr.ts
@@ -1,4 +1,4 @@
-import { fetchRemoteTextAndReadByLine } from './lib/fetch-text-by-line';
+import { fetchRemoteTextByLine } from './lib/fetch-text-by-line';
 import { processLineFromReadline } from './lib/process-line';
 import path from 'path';
 import { task } from './lib/trace-runner';
@@ -26,7 +26,7 @@ const RESERVED_IPV4_CIDR = [
 ];

 export const buildInternalReverseChnCIDR = task(import.meta.path, async () => {
-  const cidr = await processLineFromReadline(await fetchRemoteTextAndReadByLine('https://raw.githubusercontent.com/misakaio/chnroutes2/master/chnroutes.txt'));
+  const cidr = await processLineFromReadline(await fetchRemoteTextByLine('https://raw.githubusercontent.com/misakaio/chnroutes2/master/chnroutes.txt'));

   const reversedCidr = merge(
     exclude(
diff --git a/Build/build-microsoft-cdn.ts b/Build/build-microsoft-cdn.ts
index e7d24745..b1b21777 100644
--- a/Build/build-microsoft-cdn.ts
+++ b/Build/build-microsoft-cdn.ts
@@ -1,7 +1,7 @@
 import path from 'path';
 import { task, traceAsync } from './lib/trace-runner';
 import { createRuleset } from './lib/create-file';
-import { fetchRemoteTextAndReadByLine } from './lib/fetch-text-by-line';
+import { fetchRemoteTextByLine } from './lib/fetch-text-by-line';
 import { createTrie } from './lib/trie';
 import { SHARED_DESCRIPTION } from './lib/constants';
 import { createMemoizedPromise } from './lib/memo-promise';
@@ -22,7 +22,7 @@ const BLACKLIST = [
 export const getMicrosoftCdnRulesetPromise = createMemoizedPromise(async () => {
   const set = await traceAsync('fetch accelerated-domains.china.conf', async () => {
     const trie = createTrie();
-    for await (const line of await fetchRemoteTextAndReadByLine('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf')) {
+    for await (const line of await fetchRemoteTextByLine('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf')) {
       if (line.startsWith('server=/') && line.endsWith('/114.114.114.114')) {
         const domain = line.slice(8, -16);
         trie.add(domain);
diff --git a/Build/build-reject-domainset.ts b/Build/build-reject-domainset.ts
index 351970e2..857fd59e 100644
--- a/Build/build-reject-domainset.ts
+++ b/Build/build-reject-domainset.ts
@@ -32,16 +32,16 @@ export const buildRejectDomainSet = task(import.meta.path, async () => {
   const [gorhill] = await Promise.all([
     getGorhillPublicSuffixPromise(),
     // Parse from remote hosts & domain lists
-    ...HOSTS.map(entry => processHosts(entry[0], entry[1]).then(hosts => {
+    ...HOSTS.map(entry => processHosts(entry[0], entry[1], entry[2], entry[3]).then(hosts => {
       hosts.forEach(host => {
         domainSets.add(host);
       });
     })),
-    ...DOMAIN_LISTS.map(entry => processDomainLists(entry[0], entry[1])),
+    ...DOMAIN_LISTS.map(entry => processDomainLists(entry[0], entry[1], entry[2])),
     ...ADGUARD_FILTERS.map(input => {
       const promise = typeof input === 'string'
         ? processFilterRules(input)
-        : processFilterRules(input[0], input[1]);
+        : processFilterRules(input[0], input[1], input[2]);

       return promise.then(({ white, black, foundDebugDomain }) => {
         if (foundDebugDomain) {
diff --git a/Build/build-speedtest-domainset.ts b/Build/build-speedtest-domainset.ts
index b8aebfe6..55a4172d 100644
--- a/Build/build-speedtest-domainset.ts
+++ b/Build/build-speedtest-domainset.ts
@@ -21,9 +21,8 @@ const querySpeedtestApi = async (keyword: string): Promise<Array<string>> => {
     s.acquire()
   ]))[0];

-  const randomUserAgent = topUserAgents[Math.floor(Math.random() * topUserAgents.length)];
-
   try {
+    const randomUserAgent = topUserAgents[Math.floor(Math.random() * topUserAgents.length)];
     const key = `fetch speedtest endpoints: ${keyword}`;
     console.time(key);

@@ -47,6 +46,7 @@ const querySpeedtestApi = async (keyword: string): Promise<Array<string>> => {
     }

     const json = await res.json() as Array<{ url: string }>;
+    s.release();

     console.timeEnd(key);

diff --git a/Build/download-previous-build.ts b/Build/download-previous-build.ts
index 2e2918f9..814e69cc 100644
--- a/Build/download-previous-build.ts
+++ b/Build/download-previous-build.ts
@@ -1,7 +1,6 @@
 import fs from 'fs';
 import fsp from 'fs/promises';
 import path from 'path';
-import os from 'os';
 import { Readable } from 'stream';
 import { pipeline } from 'stream/promises';
 import { readFileByLine } from './lib/fetch-text-by-line';
@@ -85,16 +84,6 @@ export const downloadPreviousBuild = task(import.meta.path, async () => {
   );
 });

-export const downloadPublicSuffixList = task(import.meta.path, async () => {
-  const publicSuffixPath = path.resolve(import.meta.dir, '../node_modules/.cache/public_suffix_list_dat.txt');
-
-  const resp = await fetchWithRetry('https://publicsuffix.org/list/public_suffix_list.dat', defaultRequestInit);
-  return Bun.write(publicSuffixPath, resp as Response);
-}, 'download-publicsuffixlist');
-
 if (import.meta.main) {
-  Promise.all([
-    downloadPreviousBuild(),
-    downloadPublicSuffixList()
-  ]);
+  downloadPreviousBuild();
 }
diff --git a/Build/download-publicsuffixlist.ts b/Build/download-publicsuffixlist.ts
new file mode 100644
index 00000000..6258a560
--- /dev/null
+++ b/Build/download-publicsuffixlist.ts
@@ -0,0 +1,10 @@
+import { fsCache } from './lib/cache-filesystem';
+import { defaultRequestInit, fetchWithRetry } from './lib/fetch-retry';
+import { createMemoizedPromise } from './lib/memo-promise';
+import { traceAsync } from './lib/trace-runner';
+
+export const getPublicSuffixListTextPromise = createMemoizedPromise(() => traceAsync('obtain public_suffix_list', () => fsCache.apply(
+  'public_suffix_list.dat',
+  () => fetchWithRetry('https://publicsuffix.org/list/public_suffix_list.dat', defaultRequestInit).then(r => r.text()),
+  { ttl: 24 * 60 * 60 * 1000 }
+)));
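
Note: the new Build/download-publicsuffixlist.ts combines two layers of reuse. createMemoizedPromise dedupes the download within a single build run, while fsCache.apply persists the text across runs under a 24-hour TTL. The memo helper itself is not shown in this patch; a minimal sketch of what such a memoizer is assumed to look like (hypothetical, for illustration only):

    // Hypothetical sketch of Build/lib/memo-promise.ts (not part of this patch):
    // the first call starts the factory, later calls reuse the same in-flight promise.
    export const createMemoizedPromise = <T>(factory: () => Promise<T>): (() => Promise<T>) => {
      let cached: Promise<T> | null = null;
      return () => {
        cached ??= factory();
        return cached;
      };
    };
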
diff --git a/Build/index.ts b/Build/index.ts
index d4a851b5..e0164339 100644
--- a/Build/index.ts
+++ b/Build/index.ts
@@ -1,4 +1,4 @@
-import { downloadPreviousBuild, downloadPublicSuffixList } from './download-previous-build';
+import { downloadPreviousBuild } from './download-previous-build';
 import { buildCommon } from './build-common';
 import { buildAntiBogusDomain } from './build-anti-bogus-domain';
 import { buildAppleCdn } from './build-apple-cdn';
@@ -33,23 +33,15 @@ import type { TaskResult } from './lib/trace-runner';
   // const buildInternalReverseChnCIDRWorker = new Worker(new URL('./workers/build-internal-reverse-chn-cidr-worker.ts', import.meta.url));

   const downloadPreviousBuildPromise = downloadPreviousBuild();
-  const downloadPublicSuffixListPromise = downloadPublicSuffixList();
   const buildCommonPromise = downloadPreviousBuildPromise.then(() => buildCommon());
   const buildAntiBogusDomainPromise = downloadPreviousBuildPromise.then(() => buildAntiBogusDomain());
   const buildAppleCdnPromise = downloadPreviousBuildPromise.then(() => buildAppleCdn());
-  const buildCdnConfPromise = Promise.all([
-    downloadPreviousBuildPromise,
-    downloadPublicSuffixListPromise
-  ]).then(() => buildCdnConf());
-  const buildRejectDomainSetPromise = Promise.all([
-    downloadPreviousBuildPromise,
-    downloadPublicSuffixListPromise
-  ]).then(() => buildRejectDomainSet());
+  const buildCdnConfPromise = downloadPreviousBuildPromise.then(() => buildCdnConf());
+  const buildRejectDomainSetPromise = downloadPreviousBuildPromise.then(() => buildRejectDomainSet());
   const buildTelegramCIDRPromise = downloadPreviousBuildPromise.then(() => buildTelegramCIDR());
   const buildChnCidrPromise = downloadPreviousBuildPromise.then(() => buildChnCidr());
   const buildSpeedtestDomainSetPromise = downloadPreviousBuildPromise.then(() => buildSpeedtestDomainSet());
   const buildInternalCDNDomainsPromise = Promise.all([
-    downloadPublicSuffixListPromise,
     buildCommonPromise,
     buildCdnConfPromise
   ]).then(() => buildInternalCDNDomains());
@@ -84,7 +76,6 @@

   const stats = await Promise.all([
     downloadPreviousBuildPromise,
-    downloadPublicSuffixListPromise,
     buildCommonPromise,
     buildAntiBogusDomainPromise,
     buildAppleCdnPromise,
diff --git a/Build/lib/cache-filesystem.ts b/Build/lib/cache-filesystem.ts
new file mode 100644
index 00000000..e104cb5d
--- /dev/null
+++ b/Build/lib/cache-filesystem.ts
@@ -0,0 +1,131 @@
+// eslint-disable-next-line import/no-unresolved -- bun built-in module
+import { Database } from 'bun:sqlite';
+import os from 'os';
+import path from 'path';
+import fs from 'fs';
+import picocolors from 'picocolors';
+
+const identity = (x: any) => x;
+
+// eslint-disable-next-line sukka-ts/no-const-enum -- bun is smart, right?
+const enum CacheStatus {
+  Hit = 'hit',
+  Stale = 'stale',
+  Miss = 'miss'
+}
+
+export interface CacheOptions {
+  cachePath?: string,
+  tbd?: number
+}
+
+interface CacheApplyNonStringOption<T> {
+  ttl?: number | null,
+  serializer: (value: T) => string,
+  deserializer: (cached: string) => T,
+  temporaryBypass?: boolean
+}
+
+interface CacheApplyStringOption {
+  ttl?: number | null,
+  temporaryBypass?: boolean
+}
+
+type CacheApplyOption<T> = T extends string ? CacheApplyStringOption : CacheApplyNonStringOption<T>;
+
+export class Cache {
+  db: Database;
+  tbd = 60 * 1000; // time before deletion
+  cachePath: string;
+
+  constructor({ cachePath = path.join(os.tmpdir() || '/tmp', 'hdc'), tbd }: CacheOptions = {}) {
+    this.cachePath = cachePath;
+    fs.mkdirSync(this.cachePath, { recursive: true });
+    if (tbd != null) this.tbd = tbd;
+
+    const db = new Database(path.join(this.cachePath, 'cache.db'));
+    db.exec('PRAGMA journal_mode = WAL');
+
+    db.prepare('CREATE TABLE IF NOT EXISTS cache (key TEXT PRIMARY KEY, value TEXT, ttl REAL NOT NULL);').run();
+    db.prepare('CREATE INDEX IF NOT EXISTS cache_ttl ON cache (ttl);').run();
+
+    // perform purge on startup
+
+    // ttl + tbd < now => ttl < now - tbd
+    const now = Date.now() - this.tbd;
+    db.prepare('DELETE FROM cache WHERE ttl < ?').run(now);
+
+    this.db = db;
+  }
+
+  set(key: string, value: string, ttl = 60 * 1000): void {
+    const insert = this.db.prepare(
+      'INSERT INTO cache (key, value, ttl) VALUES ($key, $value, $valid) ON CONFLICT(key) DO UPDATE SET value = $value, ttl = $valid'
+    );
+
+    insert.run({
+      $key: key,
+      $value: value,
+      $valid: Date.now() + ttl
+    });
+  }
+
+  get(key: string, defaultValue?: string): string | undefined {
+    const rv = this.db.prepare<{ value: string }, string>(
+      'SELECT value FROM cache WHERE key = ?'
+    ).get(key);
+
+    if (!rv) return defaultValue;
+    return rv.value;
+  }
+
+  has(key: string): CacheStatus {
+    const now = Date.now();
+    const rv = this.db.prepare<{ ttl: number }, string>('SELECT ttl FROM cache WHERE key = ?').get(key);
+
+    return !rv ? CacheStatus.Miss : (rv.ttl > now ? CacheStatus.Hit : CacheStatus.Stale);
+  }
+
+  del(key: string): void {
+    this.db.prepare('DELETE FROM cache WHERE key = ?').run(key);
+  }
+
+  async apply<T>(
+    key: string,
+    fn: () => Promise<T>,
+    opt: CacheApplyOption<T>
+  ): Promise<T> {
+    const { ttl, temporaryBypass } = opt;
+
+    if (temporaryBypass) {
+      return fn();
+    }
+    if (ttl === null) {
+      this.del(key);
+      return fn();
+    }
+
+    const cached = this.get(key);
+    let value: T;
+    if (cached == null) {
+      console.log(picocolors.yellow('[cache] miss'), picocolors.gray(key));
+      value = await fn();
+
+      const serializer = 'serializer' in opt ? opt.serializer : identity;
+      this.set(key, serializer(value), ttl);
+    } else {
+      console.log(picocolors.green('[cache] hit'), picocolors.gray(key));
+
+      const deserializer = 'deserializer' in opt ? opt.deserializer : identity;
+      value = deserializer(cached);
+    }
+    return value;
+  }
+}
+
+export const fsCache = new Cache({ cachePath: path.resolve(import.meta.dir, '../../.cache') });
+
+const separator = String.fromCharCode(0);
+
+export const serializeSet = (set: Set<string>) => Array.from(set).join(separator);
+export const deserializeSet = (str: string) => new Set(str.split(separator));
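
Note: the Cache class above stores an absolute expiry in the ttl column. set() records Date.now() + ttl, has() compares that timestamp with the current time to report hit/stale/miss, and the constructor only purges rows that are past expiry by more than tbd (60 seconds by default). A hedged usage sketch, assuming the module path used in this repository:

    import { Cache } from './Build/lib/cache-filesystem';

    const cache = new Cache({ cachePath: '.cache', tbd: 60 * 1000 });

    cache.set('greeting', 'hello', 10 * 60 * 1000); // expires 10 minutes from now
    console.log(cache.get('greeting'));             // 'hello'
    console.log(cache.has('greeting'));             // 'hit' until the ttl passes, then 'stale'

    // apply() is the wrapper the build scripts use: on a miss it runs the factory
    // and stores the (optionally serialized) result under the given key.
    const text = await cache.apply(
      'example-key',
      () => fetch('https://example.com').then(r => r.text()),
      { ttl: 60 * 60 * 1000 }
    );
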
diff --git a/Build/lib/fetch-text-by-line.ts b/Build/lib/fetch-text-by-line.ts
index c88cc9e0..352e1f08 100644
--- a/Build/lib/fetch-text-by-line.ts
+++ b/Build/lib/fetch-text-by-line.ts
@@ -1,5 +1,7 @@
 import type { BunFile } from 'bun';
 import { fetchWithRetry, defaultRequestInit } from './fetch-retry';
+import { fsCache } from './cache-filesystem';
+import picocolors from 'picocolors';
 // import { TextLineStream } from './text-line-transform-stream';
 // import { PolyfillTextDecoderStream } from './text-decoder-stream';

@@ -78,6 +80,6 @@ export async function *createReadlineInterfaceFromResponse(resp: Response): Asyn
   }
 }

-export function fetchRemoteTextAndReadByLine(url: string | URL) {
+export function fetchRemoteTextByLine(url: string | URL) {
   return fetchWithRetry(url, defaultRequestInit).then(res => createReadlineInterfaceFromResponse(res as Response));
 }
diff --git a/Build/lib/get-gorhill-publicsuffix.ts b/Build/lib/get-gorhill-publicsuffix.ts
index 62c28978..f3765590 100644
--- a/Build/lib/get-gorhill-publicsuffix.ts
+++ b/Build/lib/get-gorhill-publicsuffix.ts
@@ -1,23 +1,13 @@
 import { toASCII } from 'punycode';
-import path from 'path';
 import { traceAsync } from './trace-runner';
-import { defaultRequestInit, fetchWithRetry } from './fetch-retry';
 import { createMemoizedPromise } from './memo-promise';
+import { getPublicSuffixListTextPromise } from '../download-publicsuffixlist';

-const publicSuffixPath = path.resolve(import.meta.dir, '../../node_modules/.cache/public_suffix_list_dat.txt');
-
-const getGorhillPublicSuffix = () => traceAsync('create gorhill public suffix instance', async () => {
+export const getGorhillPublicSuffixPromise = createMemoizedPromise(() => traceAsync('create gorhill public suffix instance', async () => {
   const customFetch = (url: string | URL) => Promise.resolve(Bun.file(url));
-  const publicSuffixFile = Bun.file(publicSuffixPath);

   const [publicSuffixListDat, { default: gorhill }] = await Promise.all([
-    await publicSuffixFile.exists()
-      ? publicSuffixFile.text()
-      : fetchWithRetry('https://publicsuffix.org/list/public_suffix_list.dat', defaultRequestInit).then(r => {
-        console.log('public_suffix_list.dat not found, fetch directly from remote.');
-        return r.text();
-      }),
+    getPublicSuffixListTextPromise(),
     import('@gorhill/publicsuffixlist')
   ]);

@@ -25,6 +15,4 @@ const getGorhillPublicSuffix = () => traceAsync('create gorhill public suffix in
   await gorhill.enableWASM({ customFetch });

   return gorhill;
-});
-
-export const getGorhillPublicSuffixPromise = createMemoizedPromise(getGorhillPublicSuffix);
+}));
diff --git a/Build/lib/parse-dnsmasq.ts b/Build/lib/parse-dnsmasq.ts
index 2f711e87..1c1081fb 100644
--- a/Build/lib/parse-dnsmasq.ts
+++ b/Build/lib/parse-dnsmasq.ts
@@ -1,4 +1,4 @@
-import { fetchRemoteTextAndReadByLine } from './fetch-text-by-line';
+import { fetchRemoteTextByLine } from './fetch-text-by-line';
 import { parse } from 'tldts';

 const isDomainLoose = (domain: string): boolean => {
@@ -8,7 +8,7 @@ const isDomainLoose = (domain: string): boolean => {
 export const parseFelixDnsmasq = async (url: string | URL): Promise<string[]> => {
   const res: string[] = [];

-  for await (const line of await fetchRemoteTextAndReadByLine(url)) {
+  for await (const line of await fetchRemoteTextByLine(url)) {
     if (line.startsWith('server=/') && line.endsWith('/114.114.114.114')) {
       const domain = line.replace('server=/', '').replace('/114.114.114.114', '');
       if (isDomainLoose(domain)) {
diff --git a/Build/lib/parse-filter.ts b/Build/lib/parse-filter.ts
index 28e88cea..fe1ae99b 100644
--- a/Build/lib/parse-filter.ts
+++ b/Build/lib/parse-filter.ts
@@ -1,5 +1,5 @@
 // @ts-check
-import { fetchRemoteTextAndReadByLine } from './fetch-text-by-line';
+import { fetchRemoteTextByLine } from './fetch-text-by-line';
 import { NetworkFilter } from '@cliqz/adblocker';
 import { processLine } from './process-line';
 import { getGorhillPublicSuffixPromise } from './get-gorhill-publicsuffix';
@@ -9,61 +9,79 @@ import { traceAsync } from './trace-runner';
 import picocolors from 'picocolors';
 import { normalizeDomain } from './normalize-domain';
 import { fetchAssets } from './fetch-assets';
+import { deserializeSet, fsCache, serializeSet } from './cache-filesystem';

 const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null
 let foundDebugDomain = false;

-export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false) {
-  return traceAsync(`- processDomainLists: ${domainListsUrl}`, async () => {
-    const domainSets = new Set<string>();
+export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false, ttl: number | null = null) {
+  return traceAsync(`- processDomainLists: ${domainListsUrl}`, () => fsCache.apply(
+    domainListsUrl,
+    async () => {
+      const domainSets = new Set<string>();

-    for await (const line of await fetchRemoteTextAndReadByLine(domainListsUrl)) {
-      const domainToAdd = processLine(line);
-      if (!domainToAdd) continue;
+      for await (const line of await fetchRemoteTextByLine(domainListsUrl)) {
+        const domainToAdd = processLine(line);
+        if (!domainToAdd) continue;

-      if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
-        console.warn(picocolors.red(domainListsUrl), '(black)', picocolors.bold(DEBUG_DOMAIN_TO_FIND));
-        foundDebugDomain = true;
-      }
+        if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) {
+          console.warn(picocolors.red(domainListsUrl), '(black)', picocolors.bold(DEBUG_DOMAIN_TO_FIND));
+          foundDebugDomain = true;
+        }

-      domainSets.add(includeAllSubDomain ? `.${domainToAdd}` : domainToAdd);
-    }
-
-    return domainSets;
-  });
-}
-
-export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false) {
-  return traceAsync(`- processHosts: ${hostsUrl}`, async () => {
-    const domainSets = new Set<string>();
-
-    for await (const l of await fetchRemoteTextAndReadByLine(hostsUrl)) {
-      const line = processLine(l);
-      if (!line) {
-        continue;
-      }
-
-      const domain = line.split(/\s/)[1];
-      if (!domain) {
-        continue;
-      }
-      const _domain = domain.trim();
-
-      if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
-        console.warn(picocolors.red(hostsUrl), '(black)', picocolors.bold(DEBUG_DOMAIN_TO_FIND));
-        foundDebugDomain = true;
-      }
-
-      const domainToAdd = skipDomainCheck ? _domain : normalizeDomain(_domain);
-      if (domainToAdd) {
         domainSets.add(includeAllSubDomain ? `.${domainToAdd}` : domainToAdd);
       }
+
+      return domainSets;
+    },
+    {
+      ttl,
+      temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
+      serializer: serializeSet,
+      deserializer: deserializeSet
     }
+  ));
+}

+export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false, ttl: number | null = null) {
+  return traceAsync(`- processHosts: ${hostsUrl}`, () => fsCache.apply(
+    hostsUrl,
+    async () => {
+      const domainSets = new Set<string>();

-    console.log(picocolors.gray('[process hosts]'), picocolors.gray(hostsUrl), picocolors.gray(domainSets.size));
+      for await (const l of await fetchRemoteTextByLine(hostsUrl)) {
+        const line = processLine(l);
+        if (!line) {
+          continue;
+        }

-    return domainSets;
-  });
+        const domain = line.split(/\s/)[1];
+        if (!domain) {
+          continue;
+        }
+        const _domain = domain.trim();
+
+        if (DEBUG_DOMAIN_TO_FIND && _domain.includes(DEBUG_DOMAIN_TO_FIND)) {
+          console.warn(picocolors.red(hostsUrl), '(black)', picocolors.bold(DEBUG_DOMAIN_TO_FIND));
+          foundDebugDomain = true;
+        }
+
+        const domainToAdd = skipDomainCheck ? _domain : normalizeDomain(_domain);
+        if (domainToAdd) {
+          domainSets.add(includeAllSubDomain ? `.${domainToAdd}` : domainToAdd);
+        }
+      }
+
+      console.log(picocolors.gray('[process hosts]'), picocolors.gray(hostsUrl), picocolors.gray(domainSets.size));
+
+      return domainSets;
+    },
+    {
+      ttl,
+      temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
+      serializer: serializeSet,
+      deserializer: deserializeSet
+    }
+  ));
 }

 // eslint-disable-next-line sukka-ts/no-const-enum -- bun bundler is smart, maybe?
@@ -77,90 +95,111 @@ const enum ParseType {

 export async function processFilterRules(
   filterRulesUrl: string,
-  fallbackUrls?: readonly string[] | undefined
-): Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }> {
-  const whitelistDomainSets = new Set<string>();
-  const blacklistDomainSets = new Set<string>();
+  fallbackUrls?: readonly string[] | undefined | null,
+  ttl: number | null = null
+): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> {
+  const [white, black, warningMessages] = await traceAsync(`- processFilterRules: ${filterRulesUrl}`, () => fsCache.apply<[
+    white: string[],
+    black: string[],
+    warningMessages: string[]
+  ]>(
+    filterRulesUrl,
+    async () => {
+      const whitelistDomainSets = new Set<string>();
+      const blacklistDomainSets = new Set<string>();

-  const warningMessages: string[] = [];
+      const warningMessages: string[] = [];

-  await traceAsync(`- processFilterRules: ${filterRulesUrl}`, async () => {
-    const gorhill = await getGorhillPublicSuffixPromise();
+      const gorhill = await getGorhillPublicSuffixPromise();

-    /**
+      /**
      * @param {string} line
      */
-    const lineCb = (line: string) => {
-      const result = parse(line, gorhill);
-      if (!result) {
-        return;
-      }
-
-      const flag = result[1];
-      const hostname = result[0];
-
-      if (DEBUG_DOMAIN_TO_FIND) {
-        if (hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
-          console.warn(
-            picocolors.red(filterRulesUrl),
-            flag === ParseType.WhiteIncludeSubdomain || flag === ParseType.WhiteAbsolute
-              ? '(white)'
-              : '(black)',
-            picocolors.bold(DEBUG_DOMAIN_TO_FIND)
-          );
-          foundDebugDomain = true;
+      const lineCb = (line: string) => {
+        const result = parse(line, gorhill);
+        if (!result) {
+          return;
         }
-      }

-      switch (flag) {
-        case ParseType.WhiteIncludeSubdomain:
-          if (hostname[0] !== '.') {
-            whitelistDomainSets.add(`.${hostname}`);
-          } else {
+        const flag = result[1];
+        const hostname = result[0];
+
+        if (DEBUG_DOMAIN_TO_FIND) {
+          if (hostname.includes(DEBUG_DOMAIN_TO_FIND)) {
+            console.warn(
+              picocolors.red(filterRulesUrl),
+              flag === ParseType.WhiteIncludeSubdomain || flag === ParseType.WhiteAbsolute
+                ? '(white)'
+                : '(black)',
+              picocolors.bold(DEBUG_DOMAIN_TO_FIND)
+            );
+            foundDebugDomain = true;
+          }
+        }
+
+        switch (flag) {
+          case ParseType.WhiteIncludeSubdomain:
+            if (hostname[0] !== '.') {
+              whitelistDomainSets.add(`.${hostname}`);
+            } else {
+              whitelistDomainSets.add(hostname);
+            }
+            break;
+          case ParseType.WhiteAbsolute:
             whitelistDomainSets.add(hostname);
-          }
-          break;
-        case ParseType.WhiteAbsolute:
-          whitelistDomainSets.add(hostname);
-          break;
-        case ParseType.BlackAbsolute:
-          blacklistDomainSets.add(hostname);
-          break;
-        case ParseType.BlackIncludeSubdomain:
-          if (hostname[0] !== '.') {
-            blacklistDomainSets.add(`.${hostname}`);
-          } else {
+            break;
+          case ParseType.BlackAbsolute:
             blacklistDomainSets.add(hostname);
-          }
-          break;
-        case ParseType.ErrorMessage:
-          warningMessages.push(hostname);
-          break;
-        default:
-          break;
-      }
-    };
+            break;
+          case ParseType.BlackIncludeSubdomain:
+            if (hostname[0] !== '.') {
+              blacklistDomainSets.add(`.${hostname}`);
+            } else {
+              blacklistDomainSets.add(hostname);
+            }
+            break;
+          case ParseType.ErrorMessage:
+            warningMessages.push(hostname);
+            break;
+          default:
+            break;
+        }
+      };

-    if (!fallbackUrls || fallbackUrls.length === 0) {
-      for await (const line of await fetchRemoteTextAndReadByLine(filterRulesUrl)) {
+      // TODO-SUKKA: add cache here
+      if (!fallbackUrls || fallbackUrls.length === 0) {
+        for await (const line of await fetchRemoteTextByLine(filterRulesUrl)) {
         // don't trim here
-        lineCb(line);
-      }
-    } else {
-      const filterRules = (await traceAsync(
-        picocolors.gray(`- download ${filterRulesUrl}`),
-        () => fetchAssets(filterRulesUrl, fallbackUrls),
-        picocolors.gray
-      )).split('\n');
+          lineCb(line);
+        }
+      } else {
+        const filterRules = (await traceAsync(
+          picocolors.gray(`- download ${filterRulesUrl}`),
+          () => fetchAssets(filterRulesUrl, fallbackUrls),
+          picocolors.gray
+        )).split('\n');

-      const key = picocolors.gray(`- parse adguard filter ${filterRulesUrl}`);
-      console.time(key);
-      for (let i = 0, len = filterRules.length; i < len; i++) {
-        lineCb(filterRules[i]);
+        const key = picocolors.gray(`- parse adguard filter ${filterRulesUrl}`);
+        console.time(key);
+        for (let i = 0, len = filterRules.length; i < len; i++) {
+          lineCb(filterRules[i]);
+        }
+        console.timeEnd(key);
       }
-      console.timeEnd(key);
+
+      return [
+        Array.from(whitelistDomainSets),
+        Array.from(blacklistDomainSets),
+        warningMessages
+      ];
+    },
+    {
+      ttl,
+      temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null,
+      serializer: JSON.stringify,
+      deserializer: JSON.parse
     }
-  });
+  ));

   warningMessages.forEach(msg => {
     console.warn(
@@ -172,13 +211,13 @@ export async function processFilterRules(
   console.log(
     picocolors.gray('[process filter]'),
     picocolors.gray(filterRulesUrl),
-    picocolors.gray(`white: ${whitelistDomainSets.size}`),
-    picocolors.gray(`black: ${blacklistDomainSets.size}`)
+    picocolors.gray(`white: ${white.length}`),
+    picocolors.gray(`black: ${black.length}`)
   );

   return {
-    white: whitelistDomainSets,
-    black: blacklistDomainSets,
+    white,
+    black,
     foundDebugDomain
   };
 }
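
Note: because SQLite stores the cached value as TEXT, the Set<string> produced by processHosts and processDomainLists round-trips through serializeSet/deserializeSet, which join and split on a NUL separator (String.fromCharCode(0)), a character that cannot occur in a domain, so no escaping is needed. A rough illustration of that round trip, assuming the exports above:

    import { serializeSet, deserializeSet } from './Build/lib/cache-filesystem';

    const domains = new Set(['.example.com', 'cdn.example.org']);

    const stored = serializeSet(domains);    // '.example.com\u0000cdn.example.org'
    const restored = deserializeSet(stored); // Set(2) { '.example.com', 'cdn.example.org' }

    console.log(restored.has('.example.com')); // true
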
diff --git a/Build/lib/process-line.ts b/Build/lib/process-line.ts
index 7b938006..9124ca1f 100644
--- a/Build/lib/process-line.ts
+++ b/Build/lib/process-line.ts
@@ -4,7 +4,7 @@ export const processLine = (line: string): string | null => {
   }

   const trimmed: string = line.trim();
-  if (trimmed === '') {
+  if (trimmed.length === 0) {
     return null;
   }

diff --git a/Build/lib/reject-data-source.ts b/Build/lib/reject-data-source.ts
index 744ac0be..06818627 100644
--- a/Build/lib/reject-data-source.ts
+++ b/Build/lib/reject-data-source.ts
@@ -7,11 +7,11 @@ export const HOSTS = [
   ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', false],
   ['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', true],
   // CoinBlockerList
-  ['https://zerodot1.gitlab.io/CoinBlockerLists/hosts_browser', true, true],
+  // Although the hosts file is still actively maintained, the hosts_browser file is not updated since 2021-07, so we set a 10 days cache ttl
+  ['https://zerodot1.gitlab.io/CoinBlockerLists/hosts_browser', true, true, 10 * 24 * 60 * 60 * 1000],
   // Curben's UrlHaus Malicious URL Blocklist
   // 'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-agh-online.txt',
   // 'https://urlhaus-filter.pages.dev/urlhaus-filter-agh-online.txt',
-  // 'https://ublockorigin.github.io/uAssetsCDN/thirdparties/urlhaus-filter/urlhaus-filter-online.txt',
   ['https://curbengh.github.io/urlhaus-filter/urlhaus-filter-hosts.txt', true, true],
   // Curben's Phishing URL Blocklist
   // Covered by lib/get-phishing-domains.ts
@@ -21,14 +21,24 @@ export const HOSTS = [
   // Curben's PUP Domains Blocklist
   // 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt'
   // 'https://pup-filter.pages.dev/pup-filter-agh.txt'
-  ['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, true],
+  // The PUP filter has paused the update since 2023-05, so we set a 7 days cache ttl
+  ['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, true, 7 * 24 * 60 * 60 * 1000],
   // BarbBlock
-  ['https://paulgb.github.io/BarbBlock/blacklists/hosts-file.txt', true, true]
+  // The barbblock list has never been updated since 2019-05, so we set a 10 days cache ttl
+  ['https://paulgb.github.io/BarbBlock/blacklists/hosts-file.txt', true, true, 10 * 24 * 60 * 60 * 1000]
 ] as const;

 export const DOMAIN_LISTS = [
   // DigitalSide Threat-Intel - OSINT Hub
-  ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true]
+  ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true],
+
+  // AdGuard CNAME Filter Combined
+  // Update on a 7 days basis, so we add a 36 hours cache ttl
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, 36 * 60 * 60 * 1000],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, 36 * 60 * 60 * 1000],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, 36 * 60 * 60 * 1000],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, 36 * 60 * 60 * 1000],
+  ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, 36 * 60 * 60 * 1000]
 ] as const;

 export const ADGUARD_FILTERS = [
@@ -41,7 +51,8 @@
       'https://secure.fanboy.co.nz/easylist.txt',
       'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easylist.txt',
       'https://ublockorigin.pages.dev/thirdparties/easylist.txt'
-    ]
+    ],
+    12 * 60 * 60 * 1000
   ],
   // EasyPrivacy
   [
@@ -52,7 +63,8 @@
       'https://easylist-downloads.adblockplus.org/easyprivacy.txt',
       'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easyprivacy.txt',
       'https://ublockorigin.pages.dev/thirdparties/easyprivacy.txt'
-    ]
+    ],
+    12 * 60 * 60 * 1000
   ],
   // AdGuard DNS Filter
   [
@@ -62,12 +74,6 @@ export const ADGUARD_FILTERS = [
       'https://adguardteam.github.io/HostlistsRegistry/assets/filter_1.txt'
     ]
   ],
-  // AdGuard CNAME Filter Combined
-  'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads.txt',
-  'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers.txt',
-  'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs.txt',
-  'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites.txt',
-  'https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers.txt',
   // uBlock Origin Filter List
   [
     'https://ublockorigin.github.io/uAssetsCDN/filters/filters.min.txt',
diff --git a/Build/validate-domestic.ts b/Build/validate-domestic.ts
index f9015739..9f9135a4 100644
--- a/Build/validate-domestic.ts
+++ b/Build/validate-domestic.ts
@@ -1,4 +1,4 @@
-import { fetchRemoteTextAndReadByLine, readFileByLine } from './lib/fetch-text-by-line';
+import { fetchRemoteTextByLine, readFileByLine } from './lib/fetch-text-by-line';
 import { Readable } from 'stream';
 import { parse } from 'csv-parse';
 import { createTrie } from './lib/trie';
@@ -7,7 +7,7 @@ import { processLine } from './lib/process-line';

 export const parseDomesticList = async () => {
   const set = new Set<string>();
-  for await (const line of await fetchRemoteTextAndReadByLine('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf')) {
+  for await (const line of await fetchRemoteTextByLine('https://raw.githubusercontent.com/felixonmars/dnsmasq-china-list/master/accelerated-domains.china.conf')) {
     if (line.startsWith('server=/') && line.endsWith('/114.114.114.114')) {
       const domain = line.slice(8, -16);
       set.add(domain);
diff --git a/Source/domainset/cdn.conf b/Source/domainset/cdn.conf
index 78032a85..85c4c8b4 100644
--- a/Source/domainset/cdn.conf
+++ b/Source/domainset/cdn.conf
@@ -2266,3 +2266,4 @@ ocecdn.oraclecloud.com
 assets.humix.com
 .nelreports.net
 static.mediafire.com
+player.louisvuitton.com
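
Note: one behavioural detail of Cache#apply worth calling out. Passing ttl: null deletes any stored row and recomputes, while temporaryBypass (wired to DEBUG_DOMAIN_TO_FIND !== null in parse-filter.ts) skips the cache without touching stored rows. A hedged sketch of the three modes, reusing the exported fsCache instance (the key and URL below are made up for illustration):

    import { fsCache } from './Build/lib/cache-filesystem';

    const fetchList = () => fetch('https://example.com/list.txt').then(r => r.text());

    // 1. Normal: reuse the cached value until the 12-hour ttl expires.
    await fsCache.apply('example-list', fetchList, { ttl: 12 * 60 * 60 * 1000 });

    // 2. ttl: null, drop any cached row and always recompute.
    await fsCache.apply('example-list', fetchList, { ttl: null });

    // 3. temporaryBypass, ignore the cache for this run (used while debugging a domain).
    await fsCache.apply('example-list', fetchList, { ttl: 12 * 60 * 60 * 1000, temporaryBypass: true });
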