diff --git a/Build/build-reject-domainset.ts b/Build/build-reject-domainset.ts index 4c7d1ab9..064725a5 100644 --- a/Build/build-reject-domainset.ts +++ b/Build/build-reject-domainset.ts @@ -4,7 +4,7 @@ import path from 'path'; import { processHosts, processFilterRules, processDomainLists } from './lib/parse-filter'; import { createTrie } from './lib/trie'; -import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS } from './constants/reject-data-source'; +import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA } from './constants/reject-data-source'; import { createRuleset, compareAndWriteFile } from './lib/create-file'; import { domainDeduper } from './lib/domain-deduper'; import createKeywordFilter from './lib/aho-corasick'; @@ -29,6 +29,9 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy const domainSets = new Set(); const appendArrayToDomainSets = setAddFromArrayCurried(domainSets); + const domainSetsExtra = new Set(); + const appendArrayToDomainSetsExtra = setAddFromArrayCurried(domainSetsExtra); + // Parse from AdGuard Filters const shouldStop = await span .traceChild('download and process hosts / adblock filter rules') @@ -38,7 +41,11 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy await Promise.all([ // Parse from remote hosts & domain lists HOSTS.map(entry => processHosts(childSpan, ...entry).then(appendArrayToDomainSets)), + HOSTS_EXTRA.map(entry => processHosts(childSpan, ...entry).then(appendArrayToDomainSetsExtra)), + DOMAIN_LISTS.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToDomainSets)), + DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(childSpan, ...entry).then(appendArrayToDomainSetsExtra)), + ADGUARD_FILTERS.map( input => processFilterRules(childSpan, ...input) .then(({ white, black, foundDebugDomain }) => { @@ -51,6 +58,19 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy setAddFromArray(domainSets, black); }) ), + ADGUARD_FILTERS_EXTRA.map( + input => processFilterRules(childSpan, ...input) + .then(({ white, black, foundDebugDomain }) => { + if (foundDebugDomain) { + // eslint-disable-next-line sukka/no-single-return -- not single return + shouldStop = true; + // we should not break here, as we want to see full matches from all data source + } + setAddFromArray(filterRuleWhitelistDomainSets, white); + setAddFromArray(domainSetsExtra, black); + }) + ), + ([ 'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exceptions.txt', 'https://raw.githubusercontent.com/AdguardTeam/AdGuardSDNSFilter/master/Filters/exclusions.txt' @@ -60,7 +80,7 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy setAddFromArray(filterRuleWhitelistDomainSets, black); }) )), - getPhishingDomains(childSpan).then(appendArrayToDomainSets), + getPhishingDomains(childSpan).then(appendArrayToDomainSetsExtra), getRejectSukkaConfPromise.then(appendArrayToDomainSets) ].flat()); // eslint-disable-next-line sukka/no-single-return -- not single return @@ -71,7 +91,7 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy process.exit(1); } - console.log(`Import ${domainSets.size} rules from Hosts / AdBlock Filter Rules & reject_sukka.conf!`); + console.log(`Import ${domainSets.size} + ${domainSetsExtra.size} rules from Hosts / AdBlock Filter Rules & reject_sukka.conf!`); // Dedupe domainSets const domainKeywordsSet = await span.traceChildAsync('collect black keywords/suffixes', async () => { @@ -91,25 +111,38 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy return domainKeywordsSet; }); - const trie = span.traceChildSync('create smol trie while deduping black keywords', () => { - const trie = createTrie(null, true, true); + const [baseTrie, extraTrie] = span.traceChildSync('create smol trie while deduping black keywords', () => { + const baseTrie = createTrie(null, true, true); + const extraTrie = createTrie(null, true, true); const kwfilter = createKeywordFilter(domainKeywordsSet); for (const domain of domainSets) { // exclude keyword when creating trie if (!kwfilter(domain)) { - trie.add(domain); + baseTrie.add(domain); } } - return trie; + for (const domain of domainSetsExtra) { + // exclude keyword when creating trie + if (!kwfilter(domain)) { + extraTrie.add(domain); + } + } + + return [baseTrie, extraTrie] as const; }); - span.traceChildSync('dedupe from white suffixes', () => filterRuleWhitelistDomainSets.forEach(trie.whitelist)); + span.traceChildSync('dedupe from white suffixes (base)', () => filterRuleWhitelistDomainSets.forEach(baseTrie.whitelist)); + span.traceChildSync('dedupe from white suffixes and base (extra)', () => { + domainSets.forEach(extraTrie.whitelist); + filterRuleWhitelistDomainSets.forEach(extraTrie.whitelist); + }); // Dedupe domainSets - const dudupedDominArray = span.traceChildSync('dedupe from covered subdomain', () => domainDeduper(trie)); + const dudupedDominArray = span.traceChildSync('dedupe from covered subdomain (base)', () => domainDeduper(baseTrie)); + const dudupedDominArrayExtra = span.traceChildSync('dedupe from covered subdomain (extra)', () => domainDeduper(extraTrie)); console.log(`Final size ${dudupedDominArray.length}`); @@ -118,7 +151,7 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy subdomainMap: domainArraySubdomainMap } = span.traceChildSync( 'build map for stat and sort', - () => buildParseDomainMap(dudupedDominArray) + () => buildParseDomainMap(dudupedDominArray.concat(dudupedDominArrayExtra)) ); // Create reject stats @@ -136,30 +169,46 @@ export const buildRejectDomainSet = task(import.meta.main, import.meta.path)(asy return sort(Array.from(statMap.entries()).filter(a => a[1] > 9), (a, b) => (b[1] - a[1]) || a[0].localeCompare(b[0])); }); - const description = [ - ...SHARED_DESCRIPTION, - '', - 'The domainset supports AD blocking, tracking protection, privacy protection, anti-phishing, anti-mining', - '', - 'Build from:', - ...HOSTS.map(host => ` - ${host[0]}`), - ...DOMAIN_LISTS.map(domainList => ` - ${domainList[0]}`), - ...ADGUARD_FILTERS.map(filter => ` - ${Array.isArray(filter) ? filter[0] : filter}`), - ' - https://curbengh.github.io/phishing-filter/phishing-filter-hosts.txt', - ' - https://phishing.army/download/phishing_army_blocklist.txt' - ]; - return Promise.all([ createRuleset( span, 'Sukka\'s Ruleset - Reject Base', - description, + [ + ...SHARED_DESCRIPTION, + '', + 'The domainset supports AD blocking, tracking protection, privacy protection, anti-phishing, anti-mining', + '', + 'Build from:', + ...HOSTS.map(host => ` - ${host[0]}`), + ...DOMAIN_LISTS.map(domainList => ` - ${domainList[0]}`), + ...ADGUARD_FILTERS.map(filter => ` - ${Array.isArray(filter) ? filter[0] : filter}`) + ], new Date(), - span.traceChildSync('sort reject domainset', () => sortDomains(dudupedDominArray, domainArrayMainDomainMap, domainArraySubdomainMap)), + span.traceChildSync('sort reject domainset (base)', () => sortDomains(dudupedDominArray, domainArrayMainDomainMap, domainArraySubdomainMap)), 'domainset', path.resolve(import.meta.dir, '../List/domainset/reject.conf'), path.resolve(import.meta.dir, '../Clash/domainset/reject.txt') ), + createRuleset( + span, + 'Sukka\'s Ruleset - Reject Extra', + [ + ...SHARED_DESCRIPTION, + '', + 'The domainset supports AD blocking, tracking protection, privacy protection, anti-phishing, anti-mining', + '', + 'Build from:', + ...HOSTS_EXTRA.map(host => ` - ${host[0]}`), + ...DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`), + ...ADGUARD_FILTERS_EXTRA.map(filter => ` - ${Array.isArray(filter) ? filter[0] : filter}`), + ...PHISHING_DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`) + ], + new Date(), + span.traceChildSync('sort reject domainset (extra)', () => sortDomains(dudupedDominArrayExtra, domainArrayMainDomainMap, domainArraySubdomainMap)), + 'domainset', + path.resolve(import.meta.dir, '../List/domainset/reject_extra.conf'), + path.resolve(import.meta.dir, '../Clash/domainset/reject_extra.txt') + ), compareAndWriteFile( span, rejectDomainsStats.map(([domain, count]) => `${domain}${' '.repeat(100 - domain.length)}${count}`), diff --git a/Build/constants/reject-data-source.ts b/Build/constants/reject-data-source.ts index 83a2e85b..b17cae00 100644 --- a/Build/constants/reject-data-source.ts +++ b/Build/constants/reject-data-source.ts @@ -9,29 +9,28 @@ export const HOSTS: HostsSource[] = [ true, TTL.THREE_HOURS() ], - // Dan Pollock's hosts file, 0.0.0.0 version is 30 KiB smaller - ['https://someonewhocares.org/hosts/zero/hosts', null, true, TTL.THREE_HOURS()], + // no coin list is not actively maintained, but it updates daily when being maintained, so we set a 3 days cache ttl ['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', null, true, TTL.THREE_DAYS()], // have not been updated for more than a year, so we set a 14 days cache ttl ['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', null, true, TTL.TWO_WEEKS()], ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Xiaomi-Extension.txt', null, false, TTL.THREE_DAYS()], ['https://raw.githubusercontent.com/jerryn70/GoodbyeAds/master/Extension/GoodbyeAds-Huawei-AdBlock.txt', null, false, TTL.THREE_DAYS()], - // ad-wars is not actively maintained, so we set a 7 days cache ttl - ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.ONE_WEEK()], ['https://raw.githubusercontent.com/durablenapkin/block/master/luminati.txt', null, true, TTL.THREE_HOURS()] ] as const; +export const HOSTS_EXTRA: HostsSource[] = [ + // Dan Pollock's hosts file, 0.0.0.0 version is 30 KiB smaller + ['https://someonewhocares.org/hosts/zero/hosts', null, true, TTL.THREE_HOURS()], + // ad-wars is not actively maintained, so we set a 7 days cache ttl + ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.ONE_WEEK()] +]; + export const DOMAIN_LISTS: HostsSource[] = [ // CoinBlockerList // Although the hosts file is still actively maintained, the hosts_browser file is not updated since 2021-07, so we set a 14 days cache ttl ['https://zerodot1.gitlab.io/CoinBlockerLists/list_browser.txt', [], true, TTL.TWO_WEEKS()], - // BarbBlock - // The barbblock list has never been updated since 2019-05, so we set a 14 days cache ttl - ['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', [], true, TTL.TWO_WEEKS()], - // DigitalSide Threat-Intel - OSINT Hub - // Update once per day - ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', [], true, TTL.ONE_DAY()], + // Curben's PUP Domains Blocklist // 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt' // 'https://pup-filter.pages.dev/pup-filter-agh.txt' @@ -52,7 +51,16 @@ export const DOMAIN_LISTS: HostsSource[] = [ 'https://malware-filter.gitlab.io/malware-filter/urlhaus-filter-domains.txt' ], true, TTL.THREE_HOURS() - ], + ] +] as const; + +export const DOMAIN_LISTS_EXTRA: HostsSource[] = [ + // BarbBlock + // The barbblock list has never been updated since 2019-05, so we set a 14 days cache ttl + ['https://paulgb.github.io/BarbBlock/blacklists/domain-list.txt', [], true, TTL.TWO_WEEKS()], + // DigitalSide Threat-Intel - OSINT Hub + // Update once per day + ['https://osint.digitalside.it/Threat-Intel/lists/latestdomains.txt', [], true, TTL.ONE_DAY()], // AdGuard CNAME Filter Combined // Update on a 7 days basis, so we add a 3 hours cache ttl ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_ads_justdomains.txt', [], true, TTL.THREE_DAYS()], @@ -60,9 +68,9 @@ export const DOMAIN_LISTS: HostsSource[] = [ ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_clickthroughs_justdomains.txt', [], true, TTL.THREE_DAYS()], ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_microsites_justdomains.txt', [], true, TTL.THREE_DAYS()], ['https://raw.githubusercontent.com/AdguardTeam/cname-trackers/master/data/combined_disguised_mail_trackers_justdomains.txt', [], true, TTL.THREE_DAYS()] -] as const; +]; -export const PHISHING_DOMAIN_LISTS: [HostsSource, HostsSource] = [ +export const PHISHING_DOMAIN_LISTS_EXTRA: [HostsSource, HostsSource] = [ [ 'https://curbengh.github.io/phishing-filter/phishing-filter-domains.txt', [ @@ -114,6 +122,46 @@ export const ADGUARD_FILTERS: AdGuardFilterSource[] = [ ], TTL.TWLVE_HOURS() ], + // AdGuard Base Filter + ['https://filters.adtidy.org/extension/ublock/filters/2_without_easylist.txt', null, TTL.THREE_HOURS()], + // AdGuard Mobile AD + ['https://filters.adtidy.org/extension/ublock/filters/11_optimized.txt', null, TTL.THREE_HOURS()], + // AdGuard Tracking Protection + ['https://filters.adtidy.org/extension/ublock/filters/3_optimized.txt', null, TTL.THREE_HOURS()], + // AdGuard Chinese filter (EasyList China + AdGuard Chinese filter) + ['https://filters.adtidy.org/extension/ublock/filters/224_optimized.txt', null, TTL.THREE_HOURS()], + // AdGuard Annoyances filter + ['https://filters.adtidy.org/android/filters/14_optimized.txt', null, TTL.THREE_HOURS()], + // GameConsoleAdblockList + // Update almost once per 1 to 3 months, let's set a 10 days cache ttl + ['https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt', null, TTL.TEN_DAYS()], + // PiHoleBlocklist + // Update almost once per 3 months, let's set a 10 days cache ttl + [ + 'https://perflyst.github.io/PiHoleBlocklist/SmartTV-AGH.txt', + [ + 'https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt' + ], + TTL.TEN_DAYS() + ], + // Spam404 + // Not actively maintained, let's use a 10 days cache ttl + ['https://raw.githubusercontent.com/Spam404/lists/master/adblock-list.txt', null, TTL.TEN_DAYS()], + // Brave First Party & First Party CNAME + ['https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt', null, TTL.ONE_DAY()] +] as const; + +export const ADGUARD_FILTERS_EXTRA: AdGuardFilterSource[] = [ + // EasyList Germany filter + [ + 'https://easylist.to/easylistgermany/easylistgermany.txt', + [ + 'https://easylist-downloads.adblockplus.org/easylistgermany.txt' + ], + TTL.TWLVE_HOURS() + ], + // AdGuard Japanese filter + ['https://filters.adtidy.org/extension/ublock/filters/7_optimized.txt', null, TTL.THREE_HOURS()], // uBlock Origin Filter List [ 'https://ublockorigin.github.io/uAssetsCDN/filters/filters.min.txt', @@ -152,45 +200,8 @@ export const ADGUARD_FILTERS: AdGuardFilterSource[] = [ 'https://ublockorigin.pages.dev/filters/unbreak.min.txt' ], TTL.THREE_HOURS() - ], - // AdGuard Base Filter - ['https://filters.adtidy.org/extension/ublock/filters/2_without_easylist.txt', null, TTL.THREE_HOURS()], - // AdGuard Mobile AD - ['https://filters.adtidy.org/extension/ublock/filters/11_optimized.txt', null, TTL.THREE_HOURS()], - // AdGuard Tracking Protection - ['https://filters.adtidy.org/extension/ublock/filters/3_optimized.txt', null, TTL.THREE_HOURS()], - // AdGuard Japanese filter - ['https://filters.adtidy.org/extension/ublock/filters/7_optimized.txt', null, TTL.THREE_HOURS()], - // AdGuard Chinese filter (EasyList China + AdGuard Chinese filter) - ['https://filters.adtidy.org/extension/ublock/filters/224_optimized.txt', null, TTL.THREE_HOURS()], - // AdGuard Annoyances filter - ['https://filters.adtidy.org/android/filters/14_optimized.txt', null, TTL.THREE_HOURS()], - // EasyList Germany filter - [ - 'https://easylist.to/easylistgermany/easylistgermany.txt', - [ - 'https://easylist-downloads.adblockplus.org/easylistgermany.txt' - ], - TTL.TWLVE_HOURS() - ], - // GameConsoleAdblockList - // Update almost once per 1 to 3 months, let's set a 10 days cache ttl - ['https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt', null, TTL.TEN_DAYS()], - // PiHoleBlocklist - // Update almost once per 3 months, let's set a 10 days cache ttl - [ - 'https://perflyst.github.io/PiHoleBlocklist/SmartTV-AGH.txt', - [ - 'https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt' - ], - TTL.TEN_DAYS() - ], - // Spam404 - // Not actively maintained, let's use a 10 days cache ttl - ['https://raw.githubusercontent.com/Spam404/lists/master/adblock-list.txt', null, TTL.TEN_DAYS()], - // Brave First Party & First Party CNAME - ['https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt', null, TTL.ONE_DAY()] -] as const; + ] +]; export const PREDEFINED_WHITELIST = [ '.localhost', diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts index f246e6ec..d8b57f24 100644 --- a/Build/lib/get-phishing-domains.ts +++ b/Build/lib/get-phishing-domains.ts @@ -3,7 +3,7 @@ import * as tldts from 'tldts-experimental'; import type { Span } from '../trace'; import { appendArrayInPlaceCurried } from './append-array-in-place'; -import { PHISHING_DOMAIN_LISTS } from '../constants/reject-data-source'; +import { PHISHING_DOMAIN_LISTS_EXTRA } from '../constants/reject-data-source'; import { looseTldtsOpt } from '../constants/loose-tldts-opt'; import picocolors from 'picocolors'; import createKeywordFilter from './aho-corasick'; @@ -133,7 +133,7 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g const domainArr = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => { const domainArr: string[] = []; - (await Promise.all(PHISHING_DOMAIN_LISTS.map(entry => processDomainLists(curSpan, ...entry)))) + (await Promise.all(PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(curSpan, ...entry)))) .forEach(appendArrayInPlaceCurried(domainArr)); return domainArr;