diff --git a/Build/build-speedtest-domainset.ts b/Build/build-speedtest-domainset.ts index 77db7b0a..9f0e1edf 100644 --- a/Build/build-speedtest-domainset.ts +++ b/Build/build-speedtest-domainset.ts @@ -12,17 +12,17 @@ import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix'; import picocolors from 'picocolors'; import { fetchRemoteTextByLine } from './lib/fetch-text-by-line'; import { processLine } from './lib/process-line'; +import { TTL, deserializeArray, fsCache, serializeArray } from './lib/cache-filesystem'; const s = new Sema(2); const latestTopUserAgentsPromise = fetchWithRetry('https://unpkg.com/top-user-agents@latest/index.json') - .then(res => res.json()); + .then(res => res.json()).then(userAgents => userAgents.filter(ua => ua.startsWith('Mozilla/5.0 '))); const querySpeedtestApi = async (keyword: string): Promise> => { - const topUserAgents = (await Promise.all([ - latestTopUserAgentsPromise, - s.acquire() - ]))[0]; + const topUserAgents = await latestTopUserAgentsPromise; + + const url = `https://www.speedtest.net/api/js/servers?engine=js&search=${keyword}&limit=100`; try { const randomUserAgent = topUserAgents[Math.floor(Math.random() * topUserAgents.length)]; @@ -30,39 +30,51 @@ const querySpeedtestApi = async (keyword: string): Promise> console.log(key); console.time(key); - const res = await fetchWithRetry(`https://www.speedtest.net/api/js/servers?engine=js&search=${keyword}&limit=100`, { - headers: { - dnt: '1', - Referer: 'https://www.speedtest.net/', - accept: 'application/json, text/plain, */*', - 'User-Agent': randomUserAgent, - 'Accept-Language': 'en-US,en;q=0.9', - ...(randomUserAgent.includes('Chrome') - ? 
{ - 'Sec-Ch-Ua-Mobile': '?0', - 'Sec-Fetch-Dest': 'empty', - 'Sec-Fetch-Mode': 'cors', - 'Sec-Fetch-Site': 'same-origin', - 'Sec-Gpc': '1' + const json = await fsCache.apply( + url, + () => s.acquire().then(() => fetchWithRetry(url, { + headers: { + dnt: '1', + Referer: 'https://www.speedtest.net/', + accept: 'application/json, text/plain, */*', + 'User-Agent': randomUserAgent, + 'Accept-Language': 'en-US,en;q=0.9', + ...(randomUserAgent.includes('Chrome') + ? { + 'Sec-Ch-Ua-Mobile': '?0', + 'Sec-Fetch-Dest': 'empty', + 'Sec-Fetch-Mode': 'cors', + 'Sec-Fetch-Site': 'same-origin', + 'Sec-Gpc': '1' + } + : {}) + }, + signal: AbortSignal.timeout(1000 * 4), + retry: { + retries: 2 + } + })).then(r => r.json>()).then(data => data.reduce( + (prev, cur) => { + const hn = tldts.getHostname(cur.url, { detectIp: false }); + if (hn) { + prev.push(hn); } - : {}) - }, - signal: AbortSignal.timeout(1000 * 4), - retry: { - retries: 2 + return prev; + }, [] + )).finally(() => s.release()), + { + ttl: TTL.ONE_WEEK(), + serializer: serializeArray, + deserializer: deserializeArray } - }); - - const json = await res.json>(); + ); console.timeEnd(key); - return json.map(({ url }) => tldts.getHostname(url, { detectIp: false })); + return json; } catch (e) { console.log(e); return []; - } finally { - s.release(); } }; diff --git a/Build/download-mock-assets.ts b/Build/download-mock-assets.ts index 9cee2264..19952ee7 100644 --- a/Build/download-mock-assets.ts +++ b/Build/download-mock-assets.ts @@ -4,10 +4,10 @@ import path from 'path'; import { fetchWithRetry } from './lib/fetch-retry'; const ASSETS_LIST = { - 'www-google-analytics-com_ga.js': 'https://unpkg.com/@adguard/scriptlets@1/dist/redirect-files/google-analytics-ga.js', - 'www-googletagservices-com_gpt.js': 'https://unpkg.com/@adguard/scriptlets@1/dist/redirect-files/googletagservices-gpt.js', - 'www-google-analytics-com_analytics.js': 'https://unpkg.com/@adguard/scriptlets@1/dist/redirect-files/google-analytics.js', - 
'www-googlesyndication-com_adsbygoogle.js': 'https://unpkg.com/@adguard/scriptlets@1/dist/redirect-files/googlesyndication-adsbygoogle.js' + 'www-google-analytics-com_ga.js': 'https://raw.githubusercontent.com/AdguardTeam/Scriptlets/master/dist/redirect-files/google-analytics-ga.js', + 'www-googletagservices-com_gpt.js': 'https://raw.githubusercontent.com/AdguardTeam/Scriptlets/master/dist/redirect-files/googletagservices-gpt.js', + 'www-google-analytics-com_analytics.js': 'https://raw.githubusercontent.com/AdguardTeam/Scriptlets/master/dist/redirect-files/google-analytics.js', + 'www-googlesyndication-com_adsbygoogle.js': 'https://raw.githubusercontent.com/AdguardTeam/Scriptlets/master/dist/redirect-files/googlesyndication-adsbygoogle.js' } as const; const mockDir = path.resolve(import.meta.dir, '../Mock'); diff --git a/Build/download-publicsuffixlist.ts b/Build/download-publicsuffixlist.ts index 8ce61fd4..150189fb 100644 --- a/Build/download-publicsuffixlist.ts +++ b/Build/download-publicsuffixlist.ts @@ -1,5 +1,20 @@ +import { TTL, fsCache } from './lib/cache-filesystem'; import { defaultRequestInit, fetchWithRetry } from './lib/fetch-retry'; import { createMemoizedPromise } from './lib/memo-promise'; import { traceAsync } from './lib/trace-runner'; -export const getPublicSuffixListTextPromise = createMemoizedPromise(() => traceAsync('obtain public_suffix_list', () => fetchWithRetry('https://publicsuffix.org/list/public_suffix_list.dat', defaultRequestInit).then(r => r.text()))); +export const getPublicSuffixListTextPromise = createMemoizedPromise( + () => traceAsync( + 'obtain public_suffix_list', + () => fsCache.apply( + 'https://publicsuffix.org/list/public_suffix_list.dat', + () => fetchWithRetry('https://publicsuffix.org/list/public_suffix_list.dat', defaultRequestInit).then(r => r.text()), + { + // https://github.com/publicsuffix/list/blob/master/.github/workflows/tld-update.yml + // Though the action runs every 24 hours, the IANA list is updated every 7 
days. + // So a 3 day TTL should be enough. + ttl: TTL.THREE_DAYS() + } + ) + ) +); diff --git a/Build/lib/cache-filesystem.ts b/Build/lib/cache-filesystem.ts index 6a1fecfc..935452cb 100644 --- a/Build/lib/cache-filesystem.ts +++ b/Build/lib/cache-filesystem.ts @@ -127,12 +127,28 @@ export class Cache { } } -// export const fsCache = new Cache({ cachePath: path.resolve(import.meta.dir, '../../.cache') }); +export const fsCache = new Cache({ cachePath: path.resolve(import.meta.dir, '../../.cache') }); // process.on('exit', () => { // fsCache.destroy(); // }); -const separator = String.fromCharCode(0); +const randomInt = (min: number, max: number) => Math.floor(Math.random() * (max - min + 1)) + min; +// Add some randomness to the cache ttl to avoid thundering herd +export const TTL = { + TWLVE_HOURS: () => randomInt(9, 14) * 60 * 60 * 1000, + THREE_DAYS: () => randomInt(2, 4) * 24 * 60 * 60 * 1000, + ONE_WEEK: () => randomInt(5, 8) * 24 * 60 * 60 * 1000, + TWO_WEEKS: () => randomInt(12, 16) * 24 * 60 * 60 * 1000, + TEN_DAYS: () => randomInt(9, 11) * 24 * 60 * 60 * 1000 +}; + +const separator = String.fromCharCode(0); +// const textEncoder = new TextEncoder(); +// const textDecoder = new TextDecoder(); +// export const serializeString = (str: string) => textEncoder.encode(str); +// export const deserializeString = (str: string) => textDecoder.decode(new Uint8Array(str.split(separator).map(Number))); export const serializeSet = (set: Set) => Array.from(set).join(separator); export const deserializeSet = (str: string) => new Set(str.split(separator)); +export const serializeArray = (arr: string[]) => arr.join(separator); +export const deserializeArray = (str: string) => str.split(separator); diff --git a/Build/lib/parse-filter.ts b/Build/lib/parse-filter.ts index fba447ab..9035f1b2 100644 --- a/Build/lib/parse-filter.ts +++ b/Build/lib/parse-filter.ts @@ -9,14 +9,15 @@ import { traceAsync } from './trace-runner'; import picocolors from 'picocolors'; import { 
normalizeDomain } from './normalize-domain'; import { fetchAssets } from './fetch-assets'; +import { deserializeSet, fsCache, serializeSet } from './cache-filesystem'; const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null let foundDebugDomain = false; -export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false, _ttl: number | null = null) { - return traceAsync(`- processDomainLists: ${domainListsUrl}`, /* () => fsCache.apply( +export function processDomainLists(domainListsUrl: string, includeAllSubDomain = false, ttl: number | null = null) { + return traceAsync(`- processDomainLists: ${domainListsUrl}`, () => fsCache.apply( domainListsUrl, - */async () => { + async () => { const domainSets = new Set(); for await (const line of await fetchRemoteTextByLine(domainListsUrl)) { @@ -32,19 +33,19 @@ export function processDomainLists(domainListsUrl: string, includeAllSubDomain = } return domainSets; - });/* , + }, { ttl, temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null, serializer: serializeSet, deserializer: deserializeSet } - )); */ + )); } -export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false, _ttl: number | null = null) { - return traceAsync(`- processHosts: ${hostsUrl}`, /* () => fsCache.apply( +export function processHosts(hostsUrl: string, includeAllSubDomain = false, skipDomainCheck = false, ttl: number | null = null) { + return traceAsync(`- processHosts: ${hostsUrl}`, () => fsCache.apply( hostsUrl, - */async () => { + async () => { const domainSets = new Set(); for await (const l of await fetchRemoteTextByLine(hostsUrl)) { @@ -73,14 +74,14 @@ export function processHosts(hostsUrl: string, includeAllSubDomain = false, skip console.log(picocolors.gray('[process hosts]'), picocolors.gray(hostsUrl), picocolors.gray(domainSets.size)); return domainSets; - }); - /* { + }, + { ttl, temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null, serializer: serializeSet, deserializer: 
deserializeSet } - ) */ + )); } // eslint-disable-next-line sukka-ts/no-const-enum -- bun bundler is smart, maybe? @@ -95,15 +96,15 @@ const enum ParseType { export async function processFilterRules( filterRulesUrl: string, fallbackUrls?: readonly string[] | undefined | null, - _ttl: number | null = null + ttl: number | null = null ): Promise<{ white: string[], black: string[], foundDebugDomain: boolean }> { - const [white, black, warningMessages] = await traceAsync(`- processFilterRules: ${filterRulesUrl}`, /* () => fsCache.apply<[ + const [white, black, warningMessages] = await traceAsync(`- processFilterRules: ${filterRulesUrl}`, () => fsCache.apply<[ white: string[], black: string[], warningMessages: string[] ]>( filterRulesUrl, - */async () => { + async () => { const whitelistDomainSets = new Set(); const blacklistDomainSets = new Set(); @@ -168,7 +169,7 @@ export async function processFilterRules( // TODO-SUKKA: add cache here if (!fallbackUrls || fallbackUrls.length === 0) { for await (const line of await fetchRemoteTextByLine(filterRulesUrl)) { - // don't trim here + // don't trim here lineCb(line); } } else { @@ -191,14 +192,14 @@ export async function processFilterRules( Array.from(blacklistDomainSets), warningMessages ]; - }); - /* { + }, + { ttl, temporaryBypass: DEBUG_DOMAIN_TO_FIND !== null, serializer: JSON.stringify, deserializer: JSON.parse } - ) */ + )); warningMessages.forEach(msg => { console.warn( diff --git a/Build/lib/random-int.bench.ts b/Build/lib/random-int.bench.ts new file mode 100644 index 00000000..d6497c60 --- /dev/null +++ b/Build/lib/random-int.bench.ts @@ -0,0 +1,16 @@ +import { bench, group, run } from 'mitata'; +import { randomInt as nativeRandomInt } from 'crypto'; + +const randomInt = (min: number, max: number) => Math.floor(Math.random() * (max - min + 1)) + min; + +group('random-int', () => { + bench('crypto.randomInt', () => { + nativeRandomInt(3, 7); + }); + + bench('Math.random', () => { + randomInt(3, 7); + }); +}); + 
+run(); diff --git a/Build/lib/reject-data-source.ts b/Build/lib/reject-data-source.ts index 50e83fdb..cbd44413 100644 --- a/Build/lib/reject-data-source.ts +++ b/Build/lib/reject-data-source.ts @@ -1,14 +1,20 @@ +import { TTL } from './cache-filesystem'; + export const HOSTS = [ ['https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=0&mimetype=plaintext', true], ['https://someonewhocares.org/hosts/hosts', true], - ['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', false], - ['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', true], + // no coin list is not actively maintained, but it updates daily when being maintained, so we set a 3 days cache ttl + ['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', false, false, TTL.THREE_DAYS()], + // have not been updated for more than a year, so we set a 14 days cache ttl + ['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', true, false, TTL.TWO_WEEKS()], ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Xiaomi-Extension.txt', false], - ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', false], + ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Huawei-Extension.txt', false], + // ad-wars is not actively maintained, so we set a 7 days cache ttl + ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', false, false, TTL.ONE_WEEK()], ['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', true], // CoinBlockerList - // Although the hosts file is still actively maintained, the hosts_browser file is not updated since 2021-07, so we set a 10 days cache ttl - ['https://zerodot1.gitlab.io/CoinBlockerLists/hosts_browser', true, true, 10 * 24 * 60 * 60 * 1000], + // Although the hosts file is still actively maintained, the hosts_browser file is not 
updated since 2021-07, so we set a 14 days cache ttl + ['https://zerodot1.gitlab.io/CoinBlockerLists/hosts_browser', true, true, TTL.TWO_WEEKS()], // Curben's UrlHaus Malicious URL Blocklist // 'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-agh-online.txt', // 'https://urlhaus-filter.pages.dev/urlhaus-filter-agh-online.txt', @@ -21,23 +27,24 @@ export const HOSTS = [ // Curben's PUP Domains Blocklist // 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt' // 'https://pup-filter.pages.dev/pup-filter-agh.txt' - // The PUP filter has paused the update since 2023-05, so we set a 7 days cache ttl - ['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, true, 7 * 24 * 60 * 60 * 1000] + // The PUP filter has paused the update since 2023-05, so we set a 14 days cache ttl + ['https://curbengh.github.io/pup-filter/pup-filter-hosts.txt', true, true, TTL.TWO_WEEKS()] ] as const; export const DOMAIN_LISTS = [ // BarbBlock - // The barbblock list has never been updated since 2019-05, so we set a 10 days cache ttl - ['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', true, 10 * 24 * 60 * 60 * 1000], + // The barbblock list has never been updated since 2019-05, so we set a 14 days cache ttl + ['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', true, TTL.TWO_WEEKS()], // DigitalSide Threat-Intel - OSINT Hub - ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true], + // Update once per day + ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true, 24 * 60 * 60 * 1000], // AdGuard CNAME Filter Combined - // Update on a 7 days basis, so we add a 36 hours cache ttl - ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, 36 * 60 * 60 * 1000], - ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, 36 * 60 * 60 * 1000], - 
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, 36 * 60 * 60 * 1000], - ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, 36 * 60 * 60 * 1000], - ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, 36 * 60 * 60 * 1000] + // Update on a 7 days basis, so we add a 3 days cache ttl + ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, TTL.THREE_DAYS()], + ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, TTL.THREE_DAYS()], + ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, TTL.THREE_DAYS()], + ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, TTL.THREE_DAYS()], + ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, TTL.THREE_DAYS()] ] as const; export const ADGUARD_FILTERS = [ @@ -130,14 +137,17 @@ export const ADGUARD_FILTERS = [ // GameConsoleAdblockList 'https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt', // PiHoleBlocklist + // Update almost once per 3 months, let's set a 10 days cache ttl [ 'https://perflyst.github.io/PiHoleBlocklist/SmartTV-AGH.txt', [ 'https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt' - ] + ], + TTL.TEN_DAYS() ], // Spam404 - 'https://raw.githubusercontent.com/Spam404/lists/master/adblock-list.txt', + // Not actively maintained, let's use a 10 days cache ttl + ['https://raw.githubusercontent.com/Spam404/lists/master/adblock-list.txt', null, 
TTL.TEN_DAYS()], // Brave First Party & First Party CNAME 'https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt' ] as const; diff --git a/bun.lockb b/bun.lockb index 9bb22770..8dba67e4 100755 Binary files a/bun.lockb and b/bun.lockb differ diff --git a/package.json b/package.json index e09d829f..d6a4a65b 100644 --- a/package.json +++ b/package.json @@ -39,6 +39,7 @@ "eslint": "^8.56.0", "eslint-config-sukka": "4.1.10-beta.2", "eslint-formatter-sukka": "4.1.9", + "mitata": "^0.1.6", "typescript": "^5.3.3" }, "resolutions": {