From 0d1fddcb813dad8d5f6ded34975a7f03a028394f Mon Sep 17 00:00:00 2001 From: SukkaW Date: Mon, 30 Sep 2024 08:05:47 +0800 Subject: [PATCH] Update Reject Infra & Data Source --- Build/build-reject-domainset.ts | 50 ++++++-------- Build/constants/reject-data-source.ts | 95 +++++++++++++++++---------- Build/lib/get-phishing-domains.ts | 6 +- Build/lib/parse-filter.ts | 14 +++- Build/lib/rules/ruleset.ts | 2 +- 5 files changed, 98 insertions(+), 69 deletions(-) diff --git a/Build/build-reject-domainset.ts b/Build/build-reject-domainset.ts index 69421b2c..26791e27 100644 --- a/Build/build-reject-domainset.ts +++ b/Build/build-reject-domainset.ts @@ -4,7 +4,7 @@ import process from 'node:process'; import { processHosts, processFilterRules, processDomainLists } from './lib/parse-filter'; -import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA } from './constants/reject-data-source'; +import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA, ADGUARD_FILTERS_WHITELIST } from './constants/reject-data-source'; import { compareAndWriteFile } from './lib/create-file'; import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line'; import { task } from './trace'; @@ -44,7 +44,8 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as ...HOSTS_EXTRA.map(host => ` - ${host[0]}`), ...DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`), ...ADGUARD_FILTERS_EXTRA.map(filter => ` - ${Array.isArray(filter) ? filter[0] : filter}`), - ...PHISHING_DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`) + ...PHISHING_DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`), + ...PHISHING_HOSTS_EXTRA.map(host => ` - ${host[0]}`) ]); const appendArrayToRejectOutput = rejectOutput.addFromDomainset.bind(rejectOutput); @@ -91,18 +92,25 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as appendArrayToRejectExtraOutput(black); }) ), - - ([ - 'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exceptions.txt', - 'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exclusions.txt' - ].map( - input => processFilterRules(childSpan, input).then(({ white, black }) => { - setAddFromArray(filterRuleWhitelistDomainSets, white); - setAddFromArray(filterRuleWhitelistDomainSets, black); - }) - )), + ADGUARD_FILTERS_WHITELIST.map(entry => processFilterRules(childSpan, ...entry).then(({ white, black }) => { + setAddFromArray(filterRuleWhitelistDomainSets, white); + setAddFromArray(filterRuleWhitelistDomainSets, black); + })), getPhishingDomains(childSpan).then(appendArrayToRejectExtraOutput), - readFileIntoProcessedArray(path.join(SOURCE_DIR, 'domainset/reject_sukka.conf')).then(appendArrayToRejectOutput) + readFileIntoProcessedArray(path.join(SOURCE_DIR, 'domainset/reject_sukka.conf')).then(appendArrayToRejectOutput), + // Dedupe domainSets + span.traceChildAsync('collect black keywords/suffixes', async () => { + /** Collect DOMAIN-KEYWORD from non_ip/reject.conf for deduplication */ + for await (const line of readFileByLine(path.resolve(__dirname, '../Source/non_ip/reject.conf'))) { + const [type, value] = line.split(','); + if (type === 'DOMAIN-KEYWORD') { + rejectOutput.addDomainKeyword(value); // Add for later deduplication + rejectExtraOutput.addDomainKeyword(value); // Add for later deduplication + } else if (type === 'DOMAIN-SUFFIX') { + filterRuleWhitelistDomainSets.add('.' + value); + } + } + }) ].flat()); // eslint-disable-next-line sukka/no-single-return -- not single return return shouldStop; @@ -112,22 +120,6 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as process.exit(1); } - // Dedupe domainSets - await span.traceChildAsync('collect black keywords/suffixes', async () => { - /** Collect DOMAIN-KEYWORD from non_ip/reject.conf for deduplication */ - for await (const line of readFileByLine(path.resolve(__dirname, '../Source/non_ip/reject.conf'))) { - const [type, value] = line.split(','); - - if (type === 'DOMAIN-KEYWORD') { - rejectOutput.addDomainKeyword(value); // Add for later deduplication - rejectExtraOutput.addDomainKeyword(value); // Add for later deduplication - } else if (type === 'DOMAIN-SUFFIX') { - rejectOutput.whitelistDomain('.' + value); // Add for later deduplication - rejectExtraOutput.whitelistDomain('.' + value); // Add for later deduplication - } - } - }); - await Promise.all([ rejectOutput.done(), rejectExtraOutput.done() diff --git a/Build/constants/reject-data-source.ts b/Build/constants/reject-data-source.ts index 2eaa4811..170eb480 100644 --- a/Build/constants/reject-data-source.ts +++ b/Build/constants/reject-data-source.ts @@ -3,14 +3,12 @@ import { TTL } from '../lib/cache-filesystem'; type HostsSource = [main: string, mirrors: string[] | null, includeAllSubDomain: boolean, ttl: number]; export const HOSTS: HostsSource[] = [ - // no coin list is not actively maintained, but it updates daily when being maintained, so we set a 3 days cache ttl - ['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', null, true, TTL.THREE_DAYS()], // have not been updated for more than a year, so we set a 14 days cache ttl ['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', null, true, TTL.TWO_WEEKS()], ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Xiaomi-Extension.txt', null, false, TTL.ONE_WEEK()], ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Huawei-AdBlock.txt', null, false, TTL.ONE_WEEK()], ['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', null, true, TTL.THREE_HOURS()], - ['https://raw.githubusercontent.com/durablenapkin/block/refs/heads/master/tvstream.txt', null, true, TTL.THREE_HOURS()] + ['https://raw.githubusercontent.com/durablenapkin/block/master/tvstream.txt', null, true, TTL.THREE_HOURS()] ]; export const HOSTS_EXTRA: HostsSource[] = [ @@ -24,7 +22,7 @@ export const HOSTS_EXTRA: HostsSource[] = [ // Dan Pollock's hosts file, 0.0.0.0 version is 30 KiB smaller ['https://someonewhocares.org/hosts/zero/hosts', null, true, TTL.THREE_HOURS()], // ad-wars is not actively maintained, so we set a 7 days cache ttl - ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.ONE_WEEK()] + ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.TWO_WEEKS()] ]; export const DOMAIN_LISTS: HostsSource[] = [ @@ -86,9 +84,12 @@ export const PHISHING_DOMAIN_LISTS_EXTRA: HostsSource[] = [ 'https://phishing.army/download/phishing_army_blocklist.txt', [], true, TTL.THREE_HOURS() - ], + ] +]; + +export const PHISHING_HOSTS_EXTRA: HostsSource[] = [ [ - 'https://raw.githubusercontent.com/durablenapkin/scamblocklist/refs/heads/master/hosts.txt', + 'https://raw.githubusercontent.com/durablenapkin/scamblocklist/master/hosts.txt', [], true, TTL.TWLVE_HOURS() ] @@ -97,14 +98,16 @@ export const PHISHING_DOMAIN_LISTS_EXTRA: HostsSource[] = [ type AdGuardFilterSource = [main: string, mirrors: string[] | null, ttl: number, allowThirdParty?: boolean]; export const ADGUARD_FILTERS: AdGuardFilterSource[] = [ + // no coin list adguard list is more maintained than its hosts + ['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/nocoin.txt', [], TTL.TWO_WEEKS()], // EasyList [ 'https://easylist.to/easylist/easylist.txt', [ - 'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easylist.txt', - 'https://ublockorigin.pages.dev/thirdparties/easylist.txt', 'https://easylist-downloads.adblockplus.org/easylist.txt', 'https://secure.fanboy.co.nz/easylist.txt', + 'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easylist.txt', + 'https://ublockorigin.pages.dev/thirdparties/easylist.txt', 'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easylist.txt' ], TTL.TWLVE_HOURS() @@ -113,11 +116,11 @@ export const ADGUARD_FILTERS: AdGuardFilterSource[] = [ [ 'https://easylist.to/easylist/easyprivacy.txt', [ - 'https://secure.fanboy.co.nz/easyprivacy.txt', - 'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easyprivacy.txt', 'https://easylist-downloads.adblockplus.org/easyprivacy.txt', + 'https://secure.fanboy.co.nz/easyprivacy.txt', 'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easyprivacy.txt', - 'https://ublockorigin.pages.dev/thirdparties/easyprivacy.txt' + 'https://ublockorigin.pages.dev/thirdparties/easyprivacy.txt', + 'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easyprivacy.txt' ], TTL.TWLVE_HOURS() ], @@ -150,23 +153,46 @@ export const ADGUARD_FILTERS: AdGuardFilterSource[] = [ ], TTL.TEN_DAYS() ], - // Brave First Party & First Party CNAME - ['https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt', null, TTL.ONE_DAY()] + // uBlock Origin Unbreak + [ + 'https://ublockorigin.github.io/uAssetsCDN/filters/unbreak.min.txt', + [ + 'https://ublockorigin.pages.dev/filters/unbreak.min.txt' + ], + TTL.THREE_HOURS() + ] +]; + +export const ADGUARD_FILTERS_WHITELIST: AdGuardFilterSource[] = [ + [ + 'https://adguardteam.github.io/AdGuardSDNSFilter/Filters/exceptions.txt', + [ + 'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exceptions.txt' + ], + TTL.THREE_HOURS() + ], + [ + 'https://adguardteam.github.io/AdGuardSDNSFilter/Filters/exclusions.txt', + [ + 'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exclusions.txt' + ], + TTL.THREE_HOURS() + ] ]; export const ADGUARD_FILTERS_EXTRA: AdGuardFilterSource[] = [ // AdGuard Annoyances filter - ['https://filters.adtidy.org/android/filters/14_optimized.txt', null, TTL.THREE_HOURS(), true], - // AdGuard Cookie Notices - ['https://filters.adtidy.org/extension/ublock/filters/18_optimized.txt', null, TTL.THREE_HOURS(), true], - // EasyList Germany filter - [ - 'https://easylist.to/easylistgermany/easylistgermany.txt', - [ - 'https://easylist-downloads.adblockplus.org/easylistgermany.txt' - ], - TTL.TWLVE_HOURS() - ], + ['https://filters.adtidy.org/extension/ublock/filters/14_optimized.txt', null, TTL.THREE_HOURS(), true], + // AdGuard Cookie Notices, included in Annoyances filter + // ['https://filters.adtidy.org/extension/ublock/filters/18_optimized.txt', null, TTL.THREE_HOURS(), true], + // EasyList Germany filter, not even included in extra for now + // [ + // 'https://easylist.to/easylistgermany/easylistgermany.txt', + // [ + // 'https://easylist-downloads.adblockplus.org/easylistgermany.txt' + // ], + // TTL.TWLVE_HOURS() + // ], // AdGuard Japanese filter ['https://filters.adtidy.org/extension/ublock/filters/7_optimized.txt', null, TTL.THREE_HOURS()], // uBlock Origin Filter List @@ -177,8 +203,8 @@ export const ADGUARD_FILTERS_EXTRA: AdGuardFilterSource[] = [ ], TTL.THREE_HOURS() ], - // AdGuard Popup Overlay - ['https://filters.adtidy.org/extension/ublock/filters/19_optimized.txt', null, TTL.THREE_HOURS(), true], + // AdGuard Popup Overlay - included in Annoyances filter + // ['https://filters.adtidy.org/extension/ublock/filters/19_optimized.txt', null, TTL.THREE_HOURS(), true], // AdGuard Mobile Banner // almost all generic rule // ['https://filters.adtidy.org/extension/ublock/filters/20_optimized.txt', null, TTL.THREE_HOURS()], @@ -205,14 +231,6 @@ export const ADGUARD_FILTERS_EXTRA: AdGuardFilterSource[] = [ // 'https://ublockorigin.pages.dev/filters/resource-abuse.txt' // ] // ], - // uBlock Origin Unbreak - [ - 'https://ublockorigin.github.io/uAssetsCDN/filters/unbreak.min.txt', - [ - 'https://ublockorigin.pages.dev/filters/unbreak.min.txt' - ], - TTL.THREE_HOURS() - ], // uBlock Origin Annoyances [ 'https://ublockorigin.github.io/uAssetsCDN/filters/annoyances.min.txt', @@ -229,6 +247,8 @@ export const ADGUARD_FILTERS_EXTRA: AdGuardFilterSource[] = [ ], TTL.THREE_HOURS() ], + // Dandelion Sprout's Annoyances + ['https://filters.adtidy.org/extension/ublock/filters/250_optimized.txt', null, TTL.THREE_HOURS(), true], // EasyList - Newsletters [ 'https://ublockorigin.github.io/uAssetsCDN/thirdparties/easylist-newsletters.txt', @@ -253,6 +273,12 @@ export const ADGUARD_FILTERS_EXTRA: AdGuardFilterSource[] = [ 'https://secure.fanboy.co.nz/fanboy-cookiemonster_ubo.txt' ], TTL.TWLVE_HOURS() + ], + // Bypass Paywall Cleaner + [ + 'https://gitflic.ru/project/magnolia1234/bypass-paywalls-clean-filters/blob/raw?file=bpc-paywall-filter.txt', + [], + TTL.ONE_DAY() ] ]; @@ -269,6 +295,7 @@ export const PREDEFINED_WHITELIST = [ '.ip6-allhosts', '.mcastprefix', '.skk.moe', + '.cdn.cloudflare.net', // Surge/Clash doesn't support CNAME 'analytics.google.com', '.cloud.answerhub.com', 'ae01.alicdn.com', diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts index b6b49ee4..899f0071 100644 --- a/Build/lib/get-phishing-domains.ts +++ b/Build/lib/get-phishing-domains.ts @@ -1,10 +1,10 @@ -import { processDomainLists } from './parse-filter'; +import { processDomainLists, processHosts } from './parse-filter'; import * as tldts from 'tldts-experimental'; import { dummySpan } from '../trace'; import type { Span } from '../trace'; import { appendArrayInPlaceCurried } from './append-array-in-place'; -import { PHISHING_DOMAIN_LISTS_EXTRA } from '../constants/reject-data-source'; +import { PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA } from '../constants/reject-data-source'; import { loosTldOptWithPrivateDomains } from '../constants/loose-tldts-opt'; import picocolors from 'picocolors'; import createKeywordFilter from './aho-corasick'; @@ -162,6 +162,8 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g (await Promise.all(PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(curSpan, ...entry, cacheKey)))) .forEach(appendArrayInPlaceCurried(domainArr)); + (await Promise.all(PHISHING_HOSTS_EXTRA.map(entry => processHosts(curSpan, ...entry, cacheKey)))) + .forEach(appendArrayInPlaceCurried(domainArr)); return domainArr; }); diff --git a/Build/lib/parse-filter.ts b/Build/lib/parse-filter.ts index 0b18f5de..39f0514e 100644 --- a/Build/lib/parse-filter.ts +++ b/Build/lib/parse-filter.ts @@ -45,7 +45,11 @@ const domainListLineCb = (l: string, set: string[], includeAllSubDomain: boolean const cacheKey = createCacheKey(__filename); -export function processDomainLists(span: Span, domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, ttl: number | null = null, extraCacheKey: (input: string) => string = identity) { +export function processDomainLists( + span: Span, + domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, + ttl: number | null = null, extraCacheKey: (input: string) => string = identity +) { return span.traceChild(`process domainlist: ${domainListsUrl}`).traceAsyncFn((childSpan) => fsFetchCache.apply( extraCacheKey(cacheKey(domainListsUrl)), async () => { @@ -100,9 +104,13 @@ const hostsLineCb = (l: string, set: string[], includeAllSubDomain: boolean, met set.push(includeAllSubDomain ? `.${domain}` : domain); }; -export function processHosts(span: Span, hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, ttl: number | null = null) { +export function processHosts( + span: Span, + hostsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, + ttl: number | null = null, extraCacheKey: (input: string) => string = identity +) { return span.traceChild(`processhosts: ${hostsUrl}`).traceAsyncFn((childSpan) => fsFetchCache.apply( - cacheKey(hostsUrl), + extraCacheKey(cacheKey(hostsUrl)), async () => { const domainSets: string[] = []; diff --git a/Build/lib/rules/ruleset.ts b/Build/lib/rules/ruleset.ts index 74e7ba1b..ebacfde6 100644 --- a/Build/lib/rules/ruleset.ts +++ b/Build/lib/rules/ruleset.ts @@ -214,8 +214,8 @@ export class RulesetOutput extends RuleOutput { } } - console.error(picocolors.bold('Parsed Failed')); if (parsedFailures.length > 0) { + console.error(picocolors.bold('Parsed Failed')); console.table(parsedFailures); }