diff --git a/.gitignore b/.gitignore index ec3dc650..edc9638a 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ public List/domainset/reject.conf List/domainset/reject_phishing.conf +List/domainset/reject_sukka.conf List/domainset/apple_cdn.conf List/non_ip/cdn.conf List/non_ip/apple_cdn.conf diff --git a/Build/build-apple-cdn.js b/Build/build-apple-cdn.js index 0e472a23..395383d1 100644 --- a/Build/build-apple-cdn.js +++ b/Build/build-apple-cdn.js @@ -2,7 +2,7 @@ const { fetchWithRetry } = require('./lib/fetch-retry'); const fs = require('fs'); const path = require('path'); -const rDomain = /^(((?!\-))(xn\-\-)?[a-z0-9\-_]{0,61}[a-z0-9]{1,1}\.)*(xn\-\-)?([a-z0-9\-]{1,61}|[a-z0-9\-]{1,30})\.[a-z]{2,}$/m; +const { isDomainLoose } = require('./lib/is-domain-loose'); (async () => { console.time('Total Time - build-apple-cdn-conf'); @@ -16,7 +16,7 @@ const rDomain = /^(((?!\-))(xn\-\-)?[a-z0-9\-_]{0,61}[a-z0-9]{1,1}\.)*(xn\-\-)?( return null }) - .filter(domain => typeof domain === 'string' && rDomain.test(domain)); + .filter(domain => typeof domain === 'string' && isDomainLoose(domain)); await Promise.all([ fs.promises.writeFile( diff --git a/Build/build-reject-domainset.js b/Build/build-reject-domainset.js index c435be26..be995445 100644 --- a/Build/build-reject-domainset.js +++ b/Build/build-reject-domainset.js @@ -7,6 +7,10 @@ const cpuCount = require('os').cpus().length; const { isCI } = require('ci-info'); const threads = isCI ? cpuCount : cpuCount / 2; +const { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST } = require('./lib/reject-data-source'); + +const filterRuleWhitelistDomainSets = new Set(PREDEFINED_WHITELIST); + (async () => { console.time('Total Time - build-reject-domain-set'); @@ -17,11 +21,9 @@ const threads = isCI ? cpuCount : cpuCount / 2; console.time('* Download and process Hosts'); // Parse from remote hosts & domain lists - (await Promise.all([ - processHosts('https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=0&mimetype=plaintext', true), - processHosts('https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt'), - processHosts('https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt') - ])).forEach(hosts => { + (await Promise.all( + HOSTS.map(entry => processHosts(entry[0], entry[1])) + )).forEach(hosts => { hosts.forEach(host => { if (host) { domainSets.add(host); @@ -34,195 +36,9 @@ const threads = isCI ? cpuCount : cpuCount / 2; let previousSize = domainSets.size; console.log(`Import ${previousSize} rules from hosts files!`); - await fsPromises.readFile(pathResolve(__dirname, '../Source/domainset/reject_sukka.conf'), { encoding: 'utf-8' }).then(data => { - data.split('\n').forEach(line => { - const trimmed = line.trim(); - if ( - line.startsWith('#') - || line.startsWith(' ') - || line.startsWith('\r') - || line.startsWith('\n') - || trimmed === '' - ) { - return; - } - - /* if (domainSets.has(line) || domainSets.has(`.${line}`)) { - console.warn(`|${line}| is already in the list!`); - } */ - domainSets.add(trimmed); - }); - }); - - // Copy reject_sukka.conf for backward compatibility - await fse.copy(pathResolve(__dirname, '../Source/domainset/reject_sukka.conf'), pathResolve(__dirname, '../List/domainset/reject_sukka.conf')) - - previousSize = domainSets.size - previousSize; - console.log(`Import ${previousSize} rules from reject_sukka.conf!`); - // Parse from AdGuard Filters - /** @type Set */ - const filterRuleWhitelistDomainSets = new Set([ - 'localhost', - 'broadcasthost', - 'ip6-loopback', - 'ip6-localnet', - 'ip6-mcastprefix', - 'ip6-allnodes', - 'ip6-allrouters', - 'ip6-allhosts', - 'mcastprefix', - 'skk.moe', - 'analytics.google.com', - 'msa.cdn.mediaset.net', // Added manually using DOMAIN-KEYWORDS - 'cloud.answerhub.com', - 'ae01.alicdn.com', - 'whoami.akamai.net', - 'whoami.ds.akahelp.net', - 'pxlk9.net.', // This one is malformed from EasyList, which I will manually add instead - 'instant.page', // No, it doesn't violate anyone's privacy. I will whitelist it - 'piwik.pro', - 'mixpanel.com', - 'cdn.mxpnl.com', - 'heapanalytics.com', - 'segment.com', - 'segmentify.com', - 't.co', // pgl yoyo add t.co to the blacklist - 'survicate.com', // AdGuardDNSFilter - 'perfops.io', // AdGuardDNSFilter - 'd2axgrpnciinw7.cloudfront.net', // ADGuardDNSFilter - 'tb-lb.sb-cd.com', // AdGuard - 'storage.yandexcloud.net', // phishing list - 'login.microsoftonline.com' // phishing list - ]); - console.time('* Download and process AdBlock Filter Rules'); - (await Promise.all([ - // Easy List - [ - 'https://easylist.to/easylist/easylist.txt', - [ - 'https://easylist-downloads.adblockplus.org/easylist.txt', - 'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easylist.txt', - 'https://secure.fanboy.co.nz/easylist.txt' - ] - ], - // AdGuard DNS Filter - 'https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt', - // uBlock Origin Filter List - [ - 'https://ublockorigin.github.io/uAssets/filters/filters.txt', - [ - 'https://ublockorigin.github.io/uAssetsCDN/filters/filters.txt', - 'https://ublockorigin.pages.dev/filters/filters.txt' - ] - ], - [ - 'https://ublockorigin.github.io/uAssets/filters/filters-2020.txt', - [ - 'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2020.txt', - 'https://ublockorigin.pages.dev/filters/filters-2020.txt' - ] - ], - [ - 'https://ublockorigin.github.io/uAssets/filters/filters-2021.txt', - [ - 'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2021.txt', - 'https://ublockorigin.pages.dev/filters/filters-2021.txt' - ] - ], - [ - 'https://ublockorigin.github.io/uAssets/filters/filters-2022.txt', - [ - 'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2022.txt', - 'https://ublockorigin.pages.dev/filters/filters-2022.txt' - ] - ], - // uBlock Origin Badware Risk List - [ - 'https://ublockorigin.github.io/uAssets/filters/badware.txt', - [ - 'https://ublockorigin.github.io/uAssetsCDN/filters/badware.txt', - 'https://ublockorigin.pages.dev/filters/badware.txt' - ] - ], - // uBlock Origin Privacy List - [ - 'https://ublockorigin.github.io/uAssets/filters/privacy.txt', - [ - 'https://ublockorigin.github.io/uAssetsCDN/filters/privacy.txt', - 'https://ublockorigin.pages.dev/filters/privacy.txt' - ] - ], - // uBlock Origin Resource Abuse - [ - 'https://ublockorigin.github.io/uAssets/filters/resource-abuse.txt', - [ - 'https://ublockorigin.github.io/uAssetsCDN/filters/resource-abuse.txt', - 'https://ublockorigin.pages.dev/filters/resource-abuse.txt' - ] - ], - // uBlock Origin Unbreak - [ - 'https://ublockorigin.github.io/uAssets/filters/unbreak.txt', - [ - 'https://ublockorigin.github.io/uAssetsCDN/filters/unbreak.txt', - 'https://ublockorigin.pages.dev/filters/unbreak.txt' - ] - ], - // AdGuard Base Filter - 'https://filters.adtidy.org/extension/ublock/filters/2_without_easylist.txt', - // AdGuard Mobile AD - 'https://filters.adtidy.org/extension/ublock/filters/11.txt', - // AdGuard Tracking Protection - 'https://filters.adtidy.org/extension/ublock/filters/3.txt', - // AdGuard Japanese filter - 'https://filters.adtidy.org/extension/ublock/filters/7.txt', - // AdGuard Chinese filter (EasyList China + AdGuard Chinese filter) - 'https://filters.adtidy.org/extension/ublock/filters/224.txt', - // Easy Privacy - [ - 'https://easylist.to/easylist/easyprivacy.txt', - [ - 'https://secure.fanboy.co.nz/easyprivacy.txt', - 'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easyprivacy.txt', - 'https://easylist-downloads.adblockplus.org/easyprivacy.txt' - ] - ], - // Curben's UrlHaus Malicious URL Blocklist - [ - 'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-agh-online.txt', - [ - 'https://urlhaus-filter.pages.dev/urlhaus-filter-agh-online.txt', - // Prefer mirror, since malware-filter.gitlab.io has not been updated for a while - // 'https://malware-filter.gitlab.io/urlhaus-filter/urlhaus-filter-agh-online.txt' - ] - ], - // Curben's Phishing URL Blocklist - [ - 'https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt', - [ - 'https://phishing-filter.pages.dev/phishing-filter-agh.txt', - // Prefer mirror, since malware-filter.gitlab.io has not been updated for a while - // 'https://malware-filter.gitlab.io/malware-filter/phishing-filter-agh.txt' - ] - ], - // Curben's PUP Domains Blocklist - [ - 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt', - [ - 'https://pup-filter.pages.dev/pup-filter-agh.txt', - // Prefer mirror, since malware-filter.gitlab.io has not been updated for a while - // 'https://malware-filter.gitlab.io/malware-filter/pup-filter-agh.txt' - ] - ], - // GameConsoleAdblockList - 'https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt', - // PiHoleBlocklist - 'https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt', - // Spam404 - 'https://raw.githubusercontent.com/Spam404/lists/master/adblock-list.txt' - ].map(input => { + (await Promise.all(ADGUARD_FILTERS.map(input => { if (typeof input === 'string') { return processFilterRules(input); } @@ -239,9 +55,33 @@ const threads = isCI ? cpuCount : cpuCount / 2; previousSize = domainSets.size - previousSize; console.log(`Import ${previousSize} rules from adguard filters!`); + await fsPromises.readFile(pathResolve(__dirname, '../Source/domainset/reject_sukka.conf'), { encoding: 'utf-8' }).then(data => { + data.split('\n').forEach(line => { + const trimmed = line.trim(); + if ( + line.startsWith('#') + || line.startsWith(' ') + || line.startsWith('\r') + || line.startsWith('\n') + || trimmed === '' + ) { + return; + } + + domainSets.add(trimmed); + }); + }); + + // Copy reject_sukka.conf for backward compatibility + await fse.copy(pathResolve(__dirname, '../Source/domainset/reject_sukka.conf'), pathResolve(__dirname, '../List/domainset/reject_sukka.conf')) + + previousSize = domainSets.size - previousSize; + console.log(`Import ${previousSize} rules from reject_sukka.conf!`); + // Read DOMAIN Keyword const domainKeywordsSet = new Set(); const domainSuffixSet = new Set(); + await fsPromises.readFile(pathResolve(__dirname, '../List/non_ip/reject.conf'), { encoding: 'utf-8' }).then(data => { data.split('\n').forEach(line => { if (line.startsWith('DOMAIN-KEYWORD')) { @@ -268,9 +108,6 @@ const threads = isCI ? cpuCount : cpuCount / 2; return; } - /* if (domainSets.has(line) || domainSets.has(`.${line}`)) { - console.warn(`|${line}| is already in the list!`); - } */ domainSuffixSet.add(trimmed); }); }); @@ -302,11 +139,8 @@ const threads = isCI ? cpuCount : cpuCount / 2; } if (!isTobeRemoved) { - for (const white of filterRuleWhitelistDomainSets) { - if (domain.includes(white) || white.includes(domain)) { - isTobeRemoved = true; - break; - } + if (isInWhiteList(domain)) { + isTobeRemoved = true; } } @@ -380,3 +214,13 @@ const threads = isCI ? cpuCount : cpuCount / 2; process.exit(0); } })(); + +function isInWhiteList (domain) { + for (const white of filterRuleWhitelistDomainSets) { + if (domain === white || domain.endsWith(white)) { + return true; + } + } + + return false; +} diff --git a/Build/lib/is-domain-loose.js b/Build/lib/is-domain-loose.js new file mode 100644 index 00000000..381fd09d --- /dev/null +++ b/Build/lib/is-domain-loose.js @@ -0,0 +1,3 @@ +const rDomain = /^(((?!\-))(xn\-\-)?[a-z0-9\-_]{0,61}[a-z0-9]{1,1}\.)*(xn\-\-)?([a-z0-9\-]{1,61}|[a-z0-9\-]{1,30})\.[a-z]{2,}$/m; + +module.exports.isDomainLoose = (domain) => rDomain.test(domain); diff --git a/Build/lib/parse-filter.js b/Build/lib/parse-filter.js index 98d7e3da..be388ce7 100644 --- a/Build/lib/parse-filter.js +++ b/Build/lib/parse-filter.js @@ -1,7 +1,6 @@ const { isIP } = require('net'); const { fetchWithRetry } = require('./fetch-retry'); - -const rDomain = /^(((?!\-))(xn\-\-)?[a-z0-9\-_]{0,61}[a-z0-9]{1,1}\.)*(xn\-\-)?([a-z0-9\-]{1,61}|[a-z0-9\-]{1,30})\.[a-z]{2,}$/m +const { isDomainLoose } = require('./is-domain-loose'); const DEBUG_DOMAIN_TO_FIND = null; // example.com | null @@ -80,7 +79,7 @@ async function processHosts (hostsUrl, includeAllSubDomain = false) { warnOnce(hostsUrl.toString(), false, DEBUG_DOMAIN_TO_FIND); } - if (rDomain.test(domain)) { + if (isDomainLoose(domain)) { if (includeAllSubDomain) { domainSets.add(`.${domain}`); } else { @@ -146,12 +145,14 @@ async function processFilterRules (filterRulesUrl, fallbackUrls) { if (lineStartsWithDoubleVerticalBar && line.endsWith('^$badfilter')) { const domain = line.replace('||', '').replace('^$badfilter', '').trim(); - if (rDomain.test(domain)) { + if (isDomainLoose(domain)) { if (DEBUG_DOMAIN_TO_FIND && domain.includes(DEBUG_DOMAIN_TO_FIND)) { warnOnce(filterRulesUrl.toString(), true, DEBUG_DOMAIN_TO_FIND); } whitelistDomainSets.add(domain); + } else { + console.warn(' * [parse-filter white] ' + domain + ' is not a valid domain'); } } else if (line.startsWith('@@||') && ( @@ -168,12 +169,14 @@ async function processFilterRules (filterRulesUrl, fallbackUrls) { .replaceAll('^|', '') .replaceAll('^', '') .trim(); - if (rDomain.test(domain)) { + if (isDomainLoose(domain)) { if (DEBUG_DOMAIN_TO_FIND && domain.includes(DEBUG_DOMAIN_TO_FIND)) { warnOnce(filterRulesUrl.toString(), true, DEBUG_DOMAIN_TO_FIND); } whitelistDomainSets.add(domain); + } else { + console.warn(' * [parse-filter white] ' + domain + ' is not a valid domain'); } } else if ( lineStartsWithDoubleVerticalBar @@ -189,8 +192,8 @@ async function processFilterRules (filterRulesUrl, fallbackUrls) { .replaceAll('^$all', '') .replaceAll('^', '') .trim(); - if (rDomain.test(domain)) { + if (isDomainLoose(domain)) { if (DEBUG_DOMAIN_TO_FIND && domain.includes(DEBUG_DOMAIN_TO_FIND)) { warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND); } @@ -205,8 +208,7 @@ async function processFilterRules (filterRulesUrl, fallbackUrls) { ) ) { const domain = `${line.replaceAll('://', '').replaceAll('^|', '').replaceAll('^', '')}`.trim(); - if (rDomain.test(domain)) { - + if (isDomainLoose(domain)) { if (DEBUG_DOMAIN_TO_FIND && domain.includes(DEBUG_DOMAIN_TO_FIND)) { warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND); } diff --git a/Build/lib/reject-data-source.js b/Build/lib/reject-data-source.js new file mode 100644 index 00000000..76c70a51 --- /dev/null +++ b/Build/lib/reject-data-source.js @@ -0,0 +1,171 @@ +/** @type {[string, boolean][]} */ +const HOSTS = [ + ['https://pgl.yoyo.org/adservers/serverlist.php?hostformat=hosts&showintro=0&mimetype=plaintext', true], + ['https://raw.githubusercontent.com/hoshsadiq/adblock-nocoin-list/master/hosts.txt', false], + ['https://raw.githubusercontent.com/crazy-max/WindowsSpyBlocker/master/data/hosts/spy.txt', false] +] + +const ADGUARD_FILTERS = [ + // Easy List + [ + 'https://easylist.to/easylist/easylist.txt', + [ + 'https://easylist-downloads.adblockplus.org/easylist.txt', + 'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easylist.txt', + 'https://secure.fanboy.co.nz/easylist.txt' + ] + ], + // AdGuard DNS Filter + 'https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt', + // uBlock Origin Filter List + [ + 'https://ublockorigin.github.io/uAssets/filters/filters.txt', + [ + 'https://ublockorigin.github.io/uAssetsCDN/filters/filters.txt', + 'https://ublockorigin.pages.dev/filters/filters.txt' + ] + ], + [ + 'https://ublockorigin.github.io/uAssets/filters/filters-2020.txt', + [ + 'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2020.txt', + 'https://ublockorigin.pages.dev/filters/filters-2020.txt' + ] + ], + [ + 'https://ublockorigin.github.io/uAssets/filters/filters-2021.txt', + [ + 'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2021.txt', + 'https://ublockorigin.pages.dev/filters/filters-2021.txt' + ] + ], + [ + 'https://ublockorigin.github.io/uAssets/filters/filters-2022.txt', + [ + 'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2022.txt', + 'https://ublockorigin.pages.dev/filters/filters-2022.txt' + ] + ], + // uBlock Origin Badware Risk List + [ + 'https://ublockorigin.github.io/uAssets/filters/badware.txt', + [ + 'https://ublockorigin.github.io/uAssetsCDN/filters/badware.txt', + 'https://ublockorigin.pages.dev/filters/badware.txt' + ] + ], + // uBlock Origin Privacy List + [ + 'https://ublockorigin.github.io/uAssets/filters/privacy.txt', + [ + 'https://ublockorigin.github.io/uAssetsCDN/filters/privacy.txt', + 'https://ublockorigin.pages.dev/filters/privacy.txt' + ] + ], + // uBlock Origin Resource Abuse + [ + 'https://ublockorigin.github.io/uAssets/filters/resource-abuse.txt', + [ + 'https://ublockorigin.github.io/uAssetsCDN/filters/resource-abuse.txt', + 'https://ublockorigin.pages.dev/filters/resource-abuse.txt' + ] + ], + // uBlock Origin Unbreak + [ + 'https://ublockorigin.github.io/uAssets/filters/unbreak.txt', + [ + 'https://ublockorigin.github.io/uAssetsCDN/filters/unbreak.txt', + 'https://ublockorigin.pages.dev/filters/unbreak.txt' + ] + ], + // AdGuard Base Filter + 'https://filters.adtidy.org/extension/ublock/filters/2_without_easylist.txt', + // AdGuard Mobile AD + 'https://filters.adtidy.org/extension/ublock/filters/11.txt', + // AdGuard Tracking Protection + 'https://filters.adtidy.org/extension/ublock/filters/3.txt', + // AdGuard Japanese filter + 'https://filters.adtidy.org/extension/ublock/filters/7.txt', + // AdGuard Chinese filter (EasyList China + AdGuard Chinese filter) + 'https://filters.adtidy.org/extension/ublock/filters/224.txt', + // Easy Privacy + [ + 'https://easylist.to/easylist/easyprivacy.txt', + [ + 'https://secure.fanboy.co.nz/easyprivacy.txt', + 'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easyprivacy.txt', + 'https://easylist-downloads.adblockplus.org/easyprivacy.txt' + ] + ], + // Curben's UrlHaus Malicious URL Blocklist + [ + 'https://curbengh.github.io/urlhaus-filter/urlhaus-filter-agh-online.txt', + [ + 'https://urlhaus-filter.pages.dev/urlhaus-filter-agh-online.txt', + // Prefer mirror, since malware-filter.gitlab.io has not been updated for a while + // 'https://malware-filter.gitlab.io/urlhaus-filter/urlhaus-filter-agh-online.txt' + ] + ], + // Curben's Phishing URL Blocklist + [ + 'https://curbengh.github.io/phishing-filter/phishing-filter-agh.txt', + [ + 'https://phishing-filter.pages.dev/phishing-filter-agh.txt', + // Prefer mirror, since malware-filter.gitlab.io has not been updated for a while + // 'https://malware-filter.gitlab.io/malware-filter/phishing-filter-agh.txt' + ] + ], + // Curben's PUP Domains Blocklist + [ + 'https://curbengh.github.io/pup-filter/pup-filter-agh.txt', + [ + 'https://pup-filter.pages.dev/pup-filter-agh.txt', + // Prefer mirror, since malware-filter.gitlab.io has not been updated for a while + // 'https://malware-filter.gitlab.io/malware-filter/pup-filter-agh.txt' + ] + ], + // GameConsoleAdblockList + 'https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt', + // PiHoleBlocklist + 'https://raw.githubusercontent.com/Perflyst/PiHoleBlocklist/master/SmartTV-AGH.txt', + // Spam404 + 'https://raw.githubusercontent.com/Spam404/lists/master/adblock-list.txt' +]; + +const PREDEFINED_WHITELIST = [ + 'localhost', + 'broadcasthost', + 'ip6-loopback', + 'ip6-localnet', + 'ip6-mcastprefix', + 'ip6-allnodes', + 'ip6-allrouters', + 'ip6-allhosts', + 'mcastprefix', + 'skk.moe', + 'analytics.google.com', + 'msa.cdn.mediaset.net', // Added manually using DOMAIN-KEYWORDS + 'cloud.answerhub.com', + 'ae01.alicdn.com', + 'whoami.akamai.net', + 'whoami.ds.akahelp.net', + 'pxlk9.net.', // This one is malformed from EasyList, which I will manually add instead + 'instant.page', // No, it doesn't violate anyone's privacy. I will whitelist it + 'piwik.pro', + 'mixpanel.com', + 'cdn.mxpnl.com', + 'heapanalytics.com', + 'segment.com', + 'segmentify.com', + 't.co', // pgl yoyo add t.co to the blacklist + 'survicate.com', // AdGuardDNSFilter + 'perfops.io', // AdGuardDNSFilter + 'd2axgrpnciinw7.cloudfront.net', // ADGuardDNSFilter + 'tb-lb.sb-cd.com', // AdGuard + 'storage.yandexcloud.net', // phishing list + 'login.microsoftonline.com' // phishing list +]; + +module.exports.HOSTS = HOSTS; +module.exports.ADGUARD_FILTERS = ADGUARD_FILTERS; +module.exports.PREDEFINED_WHITELIST = PREDEFINED_WHITELIST;