Update Reject Filter Parsing

This commit is contained in:
SukkaW
2022-12-26 00:12:59 +08:00
parent 14494a0919
commit 44aeb217d8
8 changed files with 366 additions and 92 deletions

View File

@@ -1,4 +1,5 @@
const { fetchWithRetry } = require('./fetch-retry');
const { NetworkFilter } = require('@cliqz/adblocker');
const { normalizeDomain } = require('./is-domain-loose');
const DEBUG_DOMAIN_TO_FIND = null; // example.com | null
@@ -99,9 +100,9 @@ async function processHosts (hostsUrl, includeAllSubDomain = false) {
/**
* @param {string | URL} filterRulesUrl
* @param {(string | URL)[] | undefined} fallbackUrls
* @returns {Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean }>}
* @returns {Promise<{ white: Set<string>, black: Set<string>, foundDebugDomain: boolean, parseFailed: boolean }>}
*/
async function processFilterRules (filterRulesUrl, fallbackUrls) {
async function processFilterRules (filterRulesUrl, fallbackUrls, includeThirdParties = false) {
console.time(` - processFilterRules: ${filterRulesUrl}`);
/** @type Set<string> */
@@ -109,6 +110,26 @@ async function processFilterRules (filterRulesUrl, fallbackUrls) {
/** @type Set<string> */
const blacklistDomainSets = new Set();
/**
 * Add a domain to the blacklist set of the filter list being processed.
 *
 * When a debug domain is configured and the given domain contains it, the
 * hit is reported through `warnOnce` and `foundDebugDomain` is flipped on.
 *
 * @param {string} domainToBeAddedToBlack domain to record
 * @param {boolean} isSubDomain when truthy, the domain is stored with a
 * leading dot (one is prepended unless the domain already starts with ".")
 */
const addToBlackList = (domainToBeAddedToBlack, isSubDomain) => {
  if (DEBUG_DOMAIN_TO_FIND) {
    if (domainToBeAddedToBlack.includes(DEBUG_DOMAIN_TO_FIND)) {
      warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
      foundDebugDomain = true;
    }
  }

  // Only prepend the dot when the caller asked for it and it is not already there.
  const needsLeadingDot = isSubDomain && !domainToBeAddedToBlack.startsWith('.');
  blacklistDomainSets.add(
    needsLeadingDot ? `.${domainToBeAddedToBlack}` : domainToBeAddedToBlack
  );
};
/**
 * Add a domain to the whitelist set of the filter list being processed.
 *
 * When a debug domain is configured and the given domain contains it, the
 * hit is reported through `warnOnce` and `foundDebugDomain` is flipped on.
 *
 * @param {string} domainToBeAddedToWhite domain to record, stored as given
 */
const addToWhiteList = (domainToBeAddedToWhite) => {
  if (DEBUG_DOMAIN_TO_FIND && domainToBeAddedToWhite.includes(DEBUG_DOMAIN_TO_FIND)) {
    // NOTE(review): the second argument of warnOnce appears to be an
    // "is whitelist" flag -- the other whitelist call sites in this function
    // pass `true` and the blacklist ones pass `false`. Passing `false` here
    // would report a whitelist hit as a blacklist hit. Confirm against warnOnce.
    warnOnce(filterRulesUrl.toString(), true, DEBUG_DOMAIN_TO_FIND);
    foundDebugDomain = true;
  }
  whitelistDomainSets.add(domainToBeAddedToWhite);
};
let filterRules;
try {
/** @type string[] */
@@ -124,117 +145,236 @@ async function processFilterRules (filterRulesUrl, fallbackUrls) {
throw e;
}
for (let i = 0, len = filterRules.length; i < len; i++) {
const line = filterRules[i];
let hasParseFailed = false;
const lineStartsWithDoubleVerticalBar = line.startsWith('||');
for (let i = 0, len = filterRules.length; i < len; i++) {
const line = filterRules[i].trim();
if (
line === ''
// doesn't include
|| !line.includes('.') // rule with out dot can not be a domain
// includes
|| line.includes('#')
|| line.includes('!')
|| line.includes('?')
|| line.includes('*')
|| line.includes('/')
|| line.includes('=')
|| line.includes('[')
|| line.includes('(')
|| line.includes('$') && !lineStartsWithDoubleVerticalBar
|| line.includes(']')
|| line.includes(')')
|| line.includes(',')
|| line.includes('~')
|| line.includes('&')
|| line.includes('%')
|| ((line.includes('/') || line.includes(':')) && !line.includes('://'))
// ends with
|| line.endsWith('.')
|| line.endsWith('-')
|| line.endsWith('_')
// special modifier
|| line.includes('$popup')
|| line.includes('$removeparam')
|| line.includes('$popunder')
) {
continue;
}
const filter = NetworkFilter.parse(line);
if (filter) {
if (
filter.isElemHide()
|| filter.isGenericHide()
|| filter.isSpecificHide()
|| filter.isRedirect()
|| filter.isRedirectRule()
|| filter.hasDomains()
|| filter.isCSP() // must not be csp rule
|| (!filter.fromAny() && !filter.fromDocument())
) {
// not supported type
continue;
}
if (
filter.hasHostname() // must have
&& filter.isPlain()
&& (!filter.isRegex())
&& (!filter.isFullRegex())
) {
const hostname = filter.getHostname();
if (hostname) {
if (filter.isException() || filter.isBadFilter()) {
addToWhiteList(hostname);
continue;
}
if (filter.firstParty() === filter.thirdParty()) {
addToBlackList(hostname, true);
continue;
}
if (filter.thirdParty()) {
if (includeThirdParties) {
addToBlackList(hostname, true);
}
continue;
}
if (filter.firstParty()) {
continue;
}
}
}
}
if (line.includes('$third-party') || line.includes('$frame')) {
continue;
}
const lineEndsWithCaret = line.endsWith('^');
const lineEndsWithCaretVerticalBar = line.endsWith('^|');
if (lineStartsWithDoubleVerticalBar && line.endsWith('^$badfilter')) {
const _domain = line.replace('||', '').replace('^$badfilter', '').trim();
const domain = normalizeDomain(_domain);
if (domain) {
if (DEBUG_DOMAIN_TO_FIND && domain.includes(DEBUG_DOMAIN_TO_FIND)) {
warnOnce(filterRulesUrl.toString(), true, DEBUG_DOMAIN_TO_FIND);
foundDebugDomain = true;
if (line.startsWith('@@')) {
if (line.endsWith('$cname')) {
continue;
}
if (
(line.startsWith('@@|') || line.startsWith('@@.'))
&& (
lineEndsWithCaret
|| lineEndsWithCaretVerticalBar
|| line.endsWith('$genericblock')
|| line.endsWith('$document')
)
) {
const _domain = line
.replace('@@||', '')
.replace('@@|', '')
.replace('@@.', '')
.replace('^|', '')
.replace('^$genericblock', '')
.replace('$genericblock', '')
.replace('^$document', '')
.replace('$document', '')
.replaceAll('^', '')
.trim();
const domain = normalizeDomain(_domain);
if (domain) {
addToWhiteList(domain);
} else {
console.warn(' * [parse-filter E0001] (black) invalid domain:', _domain);
}
whitelistDomainSets.add(domain);
} else {
console.warn(' * [parse-filter white] ' + _domain + ' is not a valid domain');
continue;
}
} else if (line.startsWith('@@||')
}
if (
line.startsWith('||')
&& (
lineEndsWithCaret
|| lineEndsWithCaretVerticalBar
|| line.endsWith('^$badfilter')
|| line.endsWith('^$1p')
|| line.endsWith('$cname')
)
) {
const _domain = line
.replaceAll('@@||', '')
.replaceAll('^$badfilter', '')
.replaceAll('^$1p', '')
.replaceAll('^|', '')
.replace('||', '')
.replace('^|', '')
.replace('$cname', '')
.replaceAll('^', '')
.trim();
const domain = normalizeDomain(_domain);
if (domain) {
if (DEBUG_DOMAIN_TO_FIND && domain.includes(DEBUG_DOMAIN_TO_FIND)) {
warnOnce(filterRulesUrl.toString(), true, DEBUG_DOMAIN_TO_FIND);
foundDebugDomain = true;
}
whitelistDomainSets.add(domain);
addToBlackList(domain, true);
} else {
console.warn(' * [parse-filter white] ' + _domain + ' is not a valid domain');
console.warn(' * [parse-filter E0002] (black) invalid domain:', _domain);
}
} else if (
lineStartsWithDoubleVerticalBar
continue;
}
const lineStartsWithSingleDot = line.startsWith('.');
if (
lineStartsWithSingleDot
&& (
lineEndsWithCaret
|| lineEndsWithCaretVerticalBar
|| line.endsWith('^$all')
|| line.endsWith('^$doc')
|| line.endsWith('^$document')
)
) {
const _domain = line
.replaceAll('||', '')
.replaceAll('^|', '')
.replaceAll('^$all', '')
.replaceAll('^$document', '')
.replaceAll('^$doc', '')
.replace('^|', '')
.replaceAll('^', '')
.slice(1)
.trim();
const domain = normalizeDomain(_domain);
if (domain) {
addToBlackList(domain, true);
} else {
console.warn(' * [parse-filter E0003] (black) invalid domain:', _domain);
}
continue;
}
if (
(
line.startsWith('://')
|| line.startsWith('http://')
|| line.startsWith('https://')
|| line.startsWith('|http://')
|| line.startsWith('|https://')
)
&& (
lineEndsWithCaret
|| lineEndsWithCaretVerticalBar
)
) {
const _domain = line
.replace('|https://', '')
.replace('https://', '')
.replace('|http://', '')
.replace('http://', '')
.replace('://', '')
.replace('^|', '')
.replaceAll('^', '')
.trim();
const domain = normalizeDomain(_domain);
if (domain) {
if (DEBUG_DOMAIN_TO_FIND && domain.includes(DEBUG_DOMAIN_TO_FIND)) {
warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
foundDebugDomain = true;
}
blacklistDomainSets.add(`.${domain}`);
addToBlackList(domain, false);
} else {
console.warn(' * [parse-filter E0004] (black) invalid domain:', _domain);
}
} else if (
line.startsWith('://')
&& (
lineEndsWithCaret
|| lineEndsWithCaretVerticalBar
)
) {
const _domain = `${line.replaceAll('://', '').replaceAll('^|', '').replaceAll('^', '')}`.trim();
continue;
}
if (!line.startsWith('|') && lineEndsWithCaret) {
const _domain = line.slice(0, -1);
const domain = normalizeDomain(_domain);
if (domain) {
if (DEBUG_DOMAIN_TO_FIND && domain.includes(DEBUG_DOMAIN_TO_FIND)) {
warnOnce(filterRulesUrl.toString(), false, DEBUG_DOMAIN_TO_FIND);
foundDebugDomain = true;
}
blacklistDomainSets.add(domain);
addToBlackList(domain, false);
} else {
console.warn(' * [parse-filter E0005] (black) invalid domain:', _domain);
}
continue;
}
const tryNormalizeDomain = normalizeDomain(lineStartsWithSingleDot ? line.slice(1) : line);
if (
tryNormalizeDomain
&& (
lineStartsWithSingleDot
? tryNormalizeDomain.length === line.length - 1
: tryNormalizeDomain === line
)
) {
addToBlackList(line, true);
continue;
}
if (
!line.endsWith('.js')
) {
hasParseFailed = true;
console.warn(' * [parse-filter E0010] can not parse:', line);
}
}
@@ -243,7 +383,8 @@ async function processFilterRules (filterRulesUrl, fallbackUrls) {
return {
white: whitelistDomainSets,
black: blacklistDomainSets,
foundDebugDomain
foundDebugDomain,
parseFailed: hasParseFailed
};
}

View File

@@ -15,7 +15,18 @@ const ADGUARD_FILTERS = [
'https://easylist-downloads.adblockplus.org/easylist.txt',
'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easylist.txt',
'https://secure.fanboy.co.nz/easylist.txt'
]
],
true
],
// Easy Privacy
[
'https://easylist.to/easylist/easyprivacy.txt',
[
'https://secure.fanboy.co.nz/easyprivacy.txt',
'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easyprivacy.txt',
'https://easylist-downloads.adblockplus.org/easyprivacy.txt'
],
true
],
// AdGuard DNS Filter
'https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt',
@@ -27,28 +38,32 @@ const ADGUARD_FILTERS = [
[
'https://ublockorigin.github.io/uAssetsCDN/filters/filters.txt',
'https://ublockorigin.pages.dev/filters/filters.txt'
]
],
false
],
[
'https://ublockorigin.github.io/uAssets/filters/filters-2020.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2020.txt',
'https://ublockorigin.pages.dev/filters/filters-2020.txt'
]
],
false
],
[
'https://ublockorigin.github.io/uAssets/filters/filters-2021.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2021.txt',
'https://ublockorigin.pages.dev/filters/filters-2021.txt'
]
],
false
],
[
'https://ublockorigin.github.io/uAssets/filters/filters-2022.txt',
[
'https://ublockorigin.github.io/uAssetsCDN/filters/filters-2022.txt',
'https://ublockorigin.pages.dev/filters/filters-2022.txt'
]
],
false
],
// uBlock Origin Badware Risk List
[
@@ -56,7 +71,8 @@ const ADGUARD_FILTERS = [
[
'https://ublockorigin.github.io/uAssetsCDN/filters/badware.txt',
'https://ublockorigin.pages.dev/filters/badware.txt'
]
],
false
],
// uBlock Origin Privacy List
[
@@ -64,7 +80,8 @@ const ADGUARD_FILTERS = [
[
'https://ublockorigin.github.io/uAssetsCDN/filters/privacy.txt',
'https://ublockorigin.pages.dev/filters/privacy.txt'
]
],
false
],
// uBlock Origin Resource Abuse
[
@@ -72,7 +89,8 @@ const ADGUARD_FILTERS = [
[
'https://ublockorigin.github.io/uAssetsCDN/filters/resource-abuse.txt',
'https://ublockorigin.pages.dev/filters/resource-abuse.txt'
]
],
false
],
// uBlock Origin Unbreak
[
@@ -80,7 +98,8 @@ const ADGUARD_FILTERS = [
[
'https://ublockorigin.github.io/uAssetsCDN/filters/unbreak.txt',
'https://ublockorigin.pages.dev/filters/unbreak.txt'
]
],
false
],
// AdGuard Base Filter
'https://filters.adtidy.org/extension/ublock/filters/2_without_easylist.txt',
@@ -99,16 +118,8 @@ const ADGUARD_FILTERS = [
'https://easylist.to/easylistgermany/easylistgermany.txt',
[
'https://easylist-downloads.adblockplus.org/easylistgermany.txt'
]
],
// Easy Privacy
[
'https://easylist.to/easylist/easyprivacy.txt',
[
'https://secure.fanboy.co.nz/easyprivacy.txt',
'https://raw.githubusercontent.com/easylist/easylist/gh-pages/easyprivacy.txt',
'https://easylist-downloads.adblockplus.org/easyprivacy.txt'
]
],
false
],
// Curben's UrlHaus Malicious URL Blocklist
[
@@ -117,7 +128,8 @@ const ADGUARD_FILTERS = [
'https://urlhaus-filter.pages.dev/urlhaus-filter-agh-online.txt',
// Prefer mirror, since malware-filter.gitlab.io has not been updated for a while
// 'https://malware-filter.gitlab.io/urlhaus-filter/urlhaus-filter-agh-online.txt'
]
],
false
],
// Curben's Phishing URL Blocklist
[
@@ -126,7 +138,8 @@ const ADGUARD_FILTERS = [
'https://phishing-filter.pages.dev/phishing-filter-agh.txt',
// Prefer mirror, since malware-filter.gitlab.io has not been updated for a while
// 'https://malware-filter.gitlab.io/malware-filter/phishing-filter-agh.txt'
]
],
false
],
// Curben's PUP Domains Blocklist
[
@@ -135,7 +148,8 @@ const ADGUARD_FILTERS = [
'https://pup-filter.pages.dev/pup-filter-agh.txt',
// Prefer mirror, since malware-filter.gitlab.io has not been updated for a while
// 'https://malware-filter.gitlab.io/malware-filter/pup-filter-agh.txt'
]
],
false
],
// GameConsoleAdblockList
'https://raw.githubusercontent.com/DandelionSprout/adfilt/master/GameConsoleAdblockList.txt',
@@ -147,7 +161,8 @@ const ADGUARD_FILTERS = [
'https://paulgb.github.io/BarbBlock/blacklists/ublock-origin.txt',
// Brave First Party & First Party CNAME
'https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty.txt',
'https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty-cname.txt'
'https://raw.githubusercontent.com/brave/adblock-lists/master/brave-lists/brave-firstparty-cname.txt',
'https://raw.githubusercontent.com/brave/adblock-lists/master/coin-miners.txt'
];
const PREDEFINED_WHITELIST = [