From e4429a62eebfbad6ddf7399858bcd9581725c05b Mon Sep 17 00:00:00 2001 From: SukkaW Date: Sun, 26 May 2024 18:40:03 +0800 Subject: [PATCH] Chore: prefer domain list --- Build/build-reject-domainset.ts | 6 +-- Build/lib/append-array-in-place.ts | 2 + Build/lib/get-phishing-domains.ts | 11 +++-- Build/lib/parse-filter.ts | 41 ++++++++++++------ Build/lib/reject-data-source.ts | 69 +++++++++++++++++++----------- 5 files changed, 82 insertions(+), 47 deletions(-) diff --git a/Build/build-reject-domainset.ts b/Build/build-reject-domainset.ts index a5c9ad00..1a3492ce 100644 --- a/Build/build-reject-domainset.ts +++ b/Build/build-reject-domainset.ts @@ -36,14 +36,14 @@ export const buildRejectDomainSet = task(import.meta.path, async (span) => { let shouldStop = false; await Promise.all([ // Parse from remote hosts & domain lists - ...HOSTS.map(entry => processHosts(childSpan, entry[0], entry[1], entry[2], entry[3]).then(setAddFromArrayCurried(domainSets))), + ...HOSTS.map(entry => processHosts(childSpan, ...entry).then(setAddFromArrayCurried(domainSets))), - ...DOMAIN_LISTS.map(entry => processDomainLists(childSpan, entry[0], entry[1], entry[2]).then(setAddFromArrayCurried(domainSets))), + ...DOMAIN_LISTS.map(entry => processDomainLists(childSpan, ...entry).then(setAddFromArrayCurried(domainSets))), ...ADGUARD_FILTERS.map(input => ( typeof input === 'string' ? 
processFilterRules(childSpan, input) - : processFilterRules(childSpan, input[0], input[1], input[2]) + : processFilterRules(childSpan, ...input) ).then(({ white, black, foundDebugDomain }) => { if (foundDebugDomain) { // eslint-disable-next-line sukka/no-single-return -- not single return diff --git a/Build/lib/append-array-in-place.ts b/Build/lib/append-array-in-place.ts index a7b2e8f7..f022f7ce 100644 --- a/Build/lib/append-array-in-place.ts +++ b/Build/lib/append-array-in-place.ts @@ -19,3 +19,5 @@ export function appendArrayInPlace<T>(dest: T[], source: T[]) { } return dest; } + +export const appendArrayInPlaceCurried = <T>(dest: T[]) => (source: T[]) => appendArrayInPlace(dest, source); diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts index 76e05b85..8faf5425 100644 --- a/Build/lib/get-phishing-domains.ts +++ b/Build/lib/get-phishing-domains.ts @@ -4,7 +4,8 @@ import { getSubdomain, getPublicSuffix } from 'tldts-experimental'; import { TTL } from './cache-filesystem'; import type { Span } from '../trace'; -import { appendArrayInPlace } from './append-array-in-place'; +import { appendArrayInPlace, appendArrayInPlaceCurried } from './append-array-in-place'; +import { PHISHING_DOMAIN_LISTS } from './reject-data-source'; const BLACK_TLD = new Set([ 'accountant', @@ -101,12 +102,10 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g const gorhill = await getGorhillPublicSuffixPromise(); const domainArr = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => { - const [domainSet, domainSet2] = await Promise.all([ - processDomainLists(curSpan, 'https://curbengh.github.io/phishing-filter/phishing-filter-domains.txt', true, TTL.THREE_HOURS()), - processDomainLists(curSpan, 'https://phishing.army/download/phishing_army_blocklist.txt', true, TTL.THREE_HOURS()) - ]); + const domainSet: string[] = []; - appendArrayInPlace(domainSet, domainSet2); + (await 
Promise.all(PHISHING_DOMAIN_LISTS.map(entry => processDomainLists(curSpan, ...entry)))) + .forEach(appendArrayInPlaceCurried(domainSet)); return domainSet; }); diff --git a/Build/lib/parse-filter.ts b/Build/lib/parse-filter.ts index 3c6e52fb..4db3cd23 100644 --- a/Build/lib/parse-filter.ts +++ b/Build/lib/parse-filter.ts @@ -16,24 +16,41 @@ const DEBUG_DOMAIN_TO_FIND: string | null = null; // example.com | null let foundDebugDomain = false; const temporaryBypass = DEBUG_DOMAIN_TO_FIND !== null; -export function processDomainLists(span: Span, domainListsUrl: string, includeAllSubDomain = false, ttl: number | null = null) { - return span.traceChild(`process domainlist: ${domainListsUrl}`).traceAsyncFn(() => fsFetchCache.apply( +const domainListLineCb = (l: string, set: string[], includeAllSubDomain: boolean, meta: string) => { + let line = processLine(l); + if (!line) return; + + line = normalizeDomain(line); + if (!line) return; + + if (DEBUG_DOMAIN_TO_FIND && line.includes(DEBUG_DOMAIN_TO_FIND)) { + console.warn(picocolors.red(meta), '(black)', line.replaceAll(DEBUG_DOMAIN_TO_FIND, picocolors.bold(DEBUG_DOMAIN_TO_FIND))); + foundDebugDomain = true; + } + + set.push(includeAllSubDomain ? 
`.${line}` : line); +}; + +export function processDomainLists(span: Span, domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, ttl: number | null = null) { + return span.traceChild(`process domainlist: ${domainListsUrl}`).traceAsyncFn((childSpan) => fsFetchCache.apply( domainListsUrl, async () => { const domainSets: string[] = []; - for await (const line of await fetchRemoteTextByLine(domainListsUrl)) { - let domainToAdd = processLine(line); - if (!domainToAdd) continue; - domainToAdd = normalizeDomain(domainToAdd); - if (!domainToAdd) continue; - - if (DEBUG_DOMAIN_TO_FIND && domainToAdd.includes(DEBUG_DOMAIN_TO_FIND)) { - console.warn(picocolors.red(domainListsUrl), '(black)', domainToAdd.replaceAll(DEBUG_DOMAIN_TO_FIND, picocolors.bold(DEBUG_DOMAIN_TO_FIND))); - foundDebugDomain = true; + if (mirrors == null || mirrors.length === 0) { + for await (const l of await fetchRemoteTextByLine(domainListsUrl)) { + domainListLineCb(l, domainSets, includeAllSubDomain, domainListsUrl); } + } else { + const filterRules = await childSpan + .traceChild('download domain list') + .traceAsyncFn(() => fetchAssets(domainListsUrl, mirrors).then(text => text.split('\n'))); - domainSets.push(includeAllSubDomain ? 
`.${domainToAdd}` : domainToAdd); + childSpan.traceChild('parse domain list').traceSyncFn(() => { + for (let i = 0, len = filterRules.length; i < len; i++) { + domainListLineCb(filterRules[i], domainSets, includeAllSubDomain, domainListsUrl); + } + }); } return domainSets; diff --git a/Build/lib/reject-data-source.ts b/Build/lib/reject-data-source.ts index 54c6d161..baf53748 100644 --- a/Build/lib/reject-data-source.ts +++ b/Build/lib/reject-data-source.ts @@ -19,46 +19,63 @@ export const HOSTS: HostsSource[] = [ ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Huawei-AdBlock.txt', null, false, TTL.THREE_DAYS()], // ad-wars is not actively maintained, so we set a 7 days cache ttl ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.ONE_WEEK()], - ['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', null, true, TTL.THREE_HOURS()], - // Curben's UrlHaus Malicious URL Blocklist - [ - 'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-hosts.txt', - [ - 'https://urlhaus-filter.pages.dev/urlhaus-filter-hosts.txt', - 'https://malware-filter.gitlab.io/urlhaus-filter/urlhaus-filter-hosts.txt' - ], - true, - TTL.THREE_HOURS() - ] - // Curben's Phishing URL Blocklist - // Covered by lib/get-phishing-domains.ts - // 'https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt' - // 'https://phishing-filter.pages.dev/phishing-filter-agh.txt' - // ['https://curbengh.github.io/phishing-filter/phishing-filter-hosts.txt', true, true], + ['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', null, true, TTL.THREE_HOURS()] ] as const; -export const DOMAIN_LISTS = [ +export const DOMAIN_LISTS: HostsSource[] = [ // CoinBlockerList // Although the hosts file is still actively maintained, the hosts_browser file is not updated since 2021-07, so we set a 14 days cache ttl - ['https://zerodot1.gitlab.io/CoinBlockerLists/list_browser.txt', true, TTL.TWO_WEEKS()], + 
['https://zerodot1.gitlab.io/CoinBlockerLists/list_browser.txt', [], true, TTL.TWO_WEEKS()], // BarbBlock // The barbblock list has never been updated since 2019-05, so we set a 14 days cache ttl - ['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', true, TTL.TWO_WEEKS()], + ['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', [], true, TTL.TWO_WEEKS()], // DigitalSide Threat-Intel - OSINT Hub // Update once per day - ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', true, TTL.ONE_DAY()], + ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', [], true, TTL.ONE_DAY()], // Curben's PUP Domains Blocklist // 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt' // 'https://pup-filter.pages.dev/pup-filter-agh.txt' // The PUP filter has paused the update since 2023-05, so we set a 14 days cache ttl - ['https://curbengh.github.io/pup-filter/pup-filter-domains.txt', true, TTL.TWO_WEEKS()], + [ + 'https://curbengh.github.io/pup-filter/pup-filter-domains.txt', + [ + 'https://pup-filter.pages.dev/pup-filter-domains.txt', + 'https://malware-filter.gitlab.io/pup-filter/pup-filter-domains.txt' + ], + true, TTL.TWO_WEEKS() + ], + // Curben's UrlHaus Malicious URL Blocklist + [ + 'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-domains.txt', + [ + 'https://urlhaus-filter.pages.dev/urlhaus-filter-domains.txt', + 'https://malware-filter.gitlab.io/malware-filter/urlhaus-filter-domains.txt' + ], + true, TTL.THREE_HOURS() + ], // AdGuard CNAME Filter Combined // Update on a 7 days basis, so we add a 3 hours cache ttl - ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', true, TTL.THREE_DAYS()], - ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', true, TTL.THREE_DAYS()], - 
['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', true, TTL.THREE_DAYS()], - ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', true, TTL.THREE_DAYS()], - ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', true, TTL.THREE_DAYS()] + ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', [], true, TTL.THREE_DAYS()], + ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_trackers_justdomains.txt', [], true, TTL.THREE_DAYS()], + ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', [], true, TTL.THREE_DAYS()], + ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', [], true, TTL.THREE_DAYS()], + ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', [], true, TTL.THREE_DAYS()] +] as const; + +export const PHISHING_DOMAIN_LISTS: [HostsSource, HostsSource] = [ + [ + 'https://curbengh.github.io/phishing-filter/phishing-filter-domains.txt', + [ + 'https://phishing-filter.pages.dev/phishing-filter-domains.txt', + 'https://malware-filter.gitlab.io/malware-filter/phishing-filter-domains.txt' + ], + true, TTL.THREE_HOURS() + ], + [ + 'https://phishing.army/download/phishing_army_blocklist.txt', + [], + true, TTL.THREE_HOURS() + ] ] as const; type AdGuardFilterSource = string | [main: string, mirrors: string[] | null, ttl: number];