From 2643903b24f2285891c3b2639582737e16eeef4f Mon Sep 17 00:00:00 2001 From: SukkaW Date: Sat, 4 Jan 2025 01:30:35 +0800 Subject: [PATCH] Refactor/Perf: rewrite how rules are salvaged --- Build/lib/parse-filter.ts | 481 ++++++++++++-------------------------- Source/non_ip/reject.conf | 8 + 2 files changed, 159 insertions(+), 330 deletions(-) diff --git a/Build/lib/parse-filter.ts b/Build/lib/parse-filter.ts index 67c24ef5..cf34670c 100644 --- a/Build/lib/parse-filter.ts +++ b/Build/lib/parse-filter.ts @@ -300,7 +300,7 @@ export function parse($line: string, result: [string, ParseType], allowThirdPart return result; } - let line = $line.trim(); + const line = $line.trim(); if (line.length === 0) { result[1] = ParseType.Null; @@ -308,11 +308,14 @@ export function parse($line: string, result: [string, ParseType], allowThirdPart } const firstCharCode = line.charCodeAt(0); - let lastCharCode = line.charCodeAt(line.length - 1); + const lastCharCode = line.charCodeAt(line.length - 1); if ( firstCharCode === 47 // 47 `/` // ends with + // _160-600. + // -detect-adblock. + // _web-advert. || lastCharCode === 46 // 46 `.`, line.endsWith('.') || lastCharCode === 45 // 45 `-`, line.endsWith('-') || lastCharCode === 95 // 95 `_`, line.endsWith('_') @@ -405,263 +408,130 @@ export function parse($line: string, result: [string, ParseType], allowThirdPart } } + /** + * From now on, we are mostly facing non-standard domain rules (some are regex like) + * + * We can still salvage some of them by removing modifiers + */ + + let sliceStart = 0; + let sliceEnd = 0; + // After NetworkFilter.parse, it means the line can not be parsed by cliqz NetworkFilter // We now need to "salvage" the line as much as possible + let white = false; + let includeAllSubDomain = false; + + if ( + firstCharCode === 64 // 64 `@` + && line.charCodeAt(1) === 64 // 64 `@` + ) { + sliceStart += 2; + white = true; + includeAllSubDomain = true; + } + + /** + * Some "malformed" regex-based filters can not be parsed by NetworkFilter + * "$genericblock`" is also not supported by NetworkFilter, see: + * https://github.com/ghostery/adblocker/blob/62caf7786ba10ef03beffecd8cd4eec111bcd5ec/packages/adblocker/test/parsing.test.ts#L950 + * + * `@@||cmechina.net^$genericblock` + * `@@|ftp.bmp.ovh^|` + * `@@|adsterra.com^|` + * `@@.atlassian.net$document` + * `@@||ad.alimama.com^$genericblock` + */ + + switch (line.charCodeAt(sliceStart)) { + case 124: /** | */ + // line.startsWith('@@|') || line.startsWith('|') + sliceStart += 1; + includeAllSubDomain = false; + + if (line[sliceStart] === '|') { // line.startsWith('@@||') || line.startsWith('||') + sliceStart += 1; + includeAllSubDomain = true; + } + + break; + + case 46: { /** | */ // line.startsWith('@@.') || line.startsWith('.') + /** + * `.ay.delivery^` + * `.m.bookben.com^` + * `.wap.x4399.com^` + */ + sliceStart += 1; + includeAllSubDomain = true; + break; + } + + default: + break; + } + + switch (line.charCodeAt(sliceStart)) { + case 58: { /** : */ + /** + * `@@://googleadservices.com^|` + * `@@://www.googleadservices.com^|` + * `://mine.torrent.pw^` + * `://say.ac^` + */ + if (line[sliceStart + 1] === '/' && line[sliceStart + 2] === '/') { + includeAllSubDomain = false; + sliceStart += 3; + } + break; + } + + case 104: { /** h */ + /** |http://x.o2.pl^ */ + if (line.startsWith('http://', sliceStart)) { + sliceStart += 7; + } else if (line.startsWith('https://', sliceStart)) { + sliceStart += 8; + } + break; + } + + default: + break; + } + + const indexOfDollar = line.indexOf('$', sliceStart); + if (indexOfDollar > -1) { + sliceEnd = indexOfDollar - line.length; + } + /* - * From now on, we are mostly facing non-standard domain rules (some are regex like) - * We first skip third-party and frame rules, as Surge / Clash can't handle them + * We skip third-party and frame rules, as Surge / Clash can't handle them * * `.sharecounter.$third-party` * `.bbelements.com^$third-party` * `://o0e.ru^$third-party` * `.1.1.1.l80.js^$third-party` */ - if (line.includes('$third-party')) { - if (!allowThirdParty) { - result[1] = ParseType.Null; - return result; - } - - line = line - .replace('$third-party,', '$') - .replace('$third-party', ''); - } - - lastCharCode = line.charCodeAt(line.length - 1); - - /** @example line.endsWith('^') */ - const lineEndsWithCaret = lastCharCode === 94; // lastChar === '^'; - /** @example line.endsWith('|') */ - const lineEndsWithVerticalBar = lastCharCode === 124; // lastChar === '|'; - /** @example line.endsWith('^|') */ - const lineEndsWithCaretVerticalBar = lineEndsWithVerticalBar && line[line.length - 2] === '^'; - /** @example line.endsWith('^') || line.endsWith('^|') */ - const lineEndsWithCaretOrCaretVerticalBar = lineEndsWithCaret || lineEndsWithCaretVerticalBar; - - // whitelist (exception) if ( - firstCharCode === 64 // 64 `@` - && line[1] === '@' + !allowThirdParty + && ( + line.includes('third-party', indexOfDollar + 1) + || line.includes('3p', indexOfDollar + 1) + ) ) { - let whiteIncludeAllSubDomain = true; - - /** - * Some "malformed" regex-based filters can not be parsed by NetworkFilter - * "$genericblock`" is also not supported by NetworkFilter, see: - * https://github.com/ghostery/adblocker/blob/62caf7786ba10ef03beffecd8cd4eec111bcd5ec/packages/adblocker/test/parsing.test.ts#L950 - * - * `@@||cmechina.net^$genericblock` - * `@@|ftp.bmp.ovh^|` - * `@@|adsterra.com^|` - * `@@.atlassian.net$document` - * `@@||ad.alimama.com^$genericblock` - */ - - let sliceStart = 0; - let sliceEnd: number | undefined; - - switch (line[2]) { - case '|': - // line.startsWith('@@|') - sliceStart = 3; - whiteIncludeAllSubDomain = false; - - if (line[3] === '|') { // line.startsWith('@@||') - sliceStart = 4; - whiteIncludeAllSubDomain = true; - } - - break; - - case '.': { // line.startsWith('@@.') - sliceStart = 3; - whiteIncludeAllSubDomain = true; - break; - } - - case ':': { - /** - * line.startsWith('@@://') - * - * `@@://googleadservices.com^|` - * `@@://www.googleadservices.com^|` - */ - if (line[3] === '/' && line[4] === '/') { - whiteIncludeAllSubDomain = false; - sliceStart = 5; - } - break; - } - - default: - break; - } - - if (lineEndsWithCaret) { - sliceEnd = -1; - } else if (lineEndsWithVerticalBar) { - // It is possible that a whitelist filter ends with '|' without '^|' - // @@|www.auslogics.com| - sliceEnd = lineEndsWithCaretVerticalBar ? -2 : -1; - } else if (line.endsWith('$genericblock')) { - sliceEnd = -13; - if (line[line.length - 14] === '^') { // line.endsWith('^$genericblock') - sliceEnd = -14; - } - } else if (line.endsWith('$document')) { - sliceEnd = -9; - if (line[line.length - 10] === '^') { // line.endsWith('^$document') - sliceEnd = -10; - } - } - - if (sliceStart !== 0 || sliceEnd !== undefined) { - const sliced = line.slice(sliceStart, sliceEnd); - const domain = normalizeDomain(sliced); - if (domain) { - result[0] = domain; - result[1] = whiteIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute; - return result; - } - - result[0] = `[parse-filter E0001] (white) invalid domain: ${JSON.stringify({ - line, sliced, sliceStart, sliceEnd, domain - })}`; - result[1] = ParseType.ErrorMessage; - return result; - } - - result[0] = `[parse-filter E0006] (white) failed to parse: ${JSON.stringify({ - line, sliceStart, sliceEnd - })}`; - result[1] = ParseType.ErrorMessage; + result[1] = ParseType.Null; return result; } - if ( - // 124 `|` - // line.startsWith('|') - firstCharCode === 124 - && lineEndsWithCaretOrCaretVerticalBar - ) { - /** - * Some malformed filters can not be parsed by NetworkFilter: - * - * `||smetrics.teambeachbody.com^.com^` - * `||solutions.|pages.indigovision.com^` - * `||vystar..0rg@client.iebetanialaargentina.edu.co^` - * `app-uat.latrobehealth.com.au^predirect.snapdeal.com` - */ - - const includeAllSubDomain = line[1] === '|'; - - const sliceStart = includeAllSubDomain ? 2 : 1; - const sliceEnd = lineEndsWithCaret - ? -1 - : (lineEndsWithCaretVerticalBar ? -2 : undefined); - - const sliced = line.slice(sliceStart, sliceEnd); // we already make sure line startsWith "|" - - const domain = normalizeDomain(sliced); - if (domain) { - result[0] = domain; - result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute; - return result; - } - - result[0] = `[parse-filter E0002] (black) invalid domain: ${sliced}`; - result[1] = ParseType.ErrorMessage; - return result; + if (line.includes('badfilter', indexOfDollar + 1)) { + white = true; + } + if (line.includes('all', indexOfDollar + 1)) { + includeAllSubDomain = true; } - // if (line.endsWith('$image')) { - // /** - // * Some $image filters are not NetworkFilter: - // * - // * `app.site123.com$image` - // * `t.signaux$image` - // * `track.customer.io$image` - // */ - // } - const lineStartsWithSingleDot = firstCharCode === 46; // 46 `.` - - if ( - lineStartsWithSingleDot - && lineEndsWithCaretOrCaretVerticalBar - ) { - /** - * `.ay.delivery^` - * `.m.bookben.com^` - * `.wap.x4399.com^` - */ - const sliced = line.slice( - 1, // remove prefix dot - lineEndsWithCaret // replaceAll('^', '') - ? -1 - : (lineEndsWithCaretVerticalBar ? -2 : undefined) // replace('^|', '') - ); - - const suffix = tldts.getPublicSuffix(sliced, looseTldtsOpt); - if (!suffix) { - // This exclude domain-like resource like `1.1.4.514.js` - result[1] = ParseType.Null; - return result; - } - - const domain = normalizeDomain(sliced); - if (domain) { - result[0] = domain; - result[1] = ParseType.BlackIncludeSubdomain; - return result; - } - - result[0] = `[parse-filter E0003] (black) invalid domain: ${JSON.stringify({ sliced, domain })}`; - result[1] = ParseType.ErrorMessage; - return result; - } - - /** - * `|http://x.o2.pl^` - * `://mine.torrent.pw^` - * `://say.ac^` - */ - if (lineEndsWithCaretOrCaretVerticalBar) { - let sliceStart = 0; - let sliceEnd; - if (lineEndsWithCaret) { // line.endsWith('^') - sliceEnd = -1; - } else if (lineEndsWithCaretVerticalBar) { // line.endsWith('^|') - sliceEnd = -2; - } - if (line.startsWith('://')) { - sliceStart = 3; - } else if (line.startsWith('http://')) { - sliceStart = 7; - } else if (line.startsWith('https://')) { - sliceStart = 8; - } else if (line.startsWith('|http://')) { - sliceStart = 8; - } else if (line.startsWith('|https://')) { - sliceStart = 9; - } - - if (sliceStart !== 0 || sliceEnd !== undefined) { - const sliced = line.slice(sliceStart, sliceEnd); - const domain = normalizeDomain(sliced); - if (domain) { - result[0] = domain; - result[1] = ParseType.BlackIncludeSubdomain; - return result; - } - - result[0] = `[parse-filter E0004] (black) invalid domain: ${JSON.stringify({ - line, sliced, sliceStart, sliceEnd, domain - })}`; - result[1] = ParseType.ErrorMessage; - return result; - } - } /** * `_vmind.qqvideo.tc.qq.com^` * `arketing.indianadunes.com^` @@ -671,103 +541,54 @@ export function parse($line: string, result: [string, ParseType], allowThirdPart * `-logging.nextmedia.com` * `_social_tracking.js^` */ - if ( - firstCharCode !== 124 // 124 `|` - && lastCharCode === 94 // 94 `^` - ) { - const _domain = line.slice(0, -1); + if (line.charCodeAt(line.length + sliceEnd - 1) === 94) { // 94 `^` + /** line.endsWith('^') */ + sliceEnd -= 1; + } else if (line.charCodeAt(line.length + sliceEnd - 1) === 124) { // 124 `|` + /** line.endsWith('|') */ + sliceEnd -= 1; - const suffix = tldts.getPublicSuffix(_domain, looseTldtsOpt); - if (!suffix) { - // This exclude domain-like resource like `_social_tracking.js^` - result[1] = ParseType.Null; - return result; + if (line.charCodeAt(line.length + sliceEnd - 1) === 94) { // 94 `^` + /** line.endsWith('^|') */ + sliceEnd -= 1; } - - const domain = normalizeDomain(_domain); - if (domain) { - result[0] = domain; - result[1] = ParseType.BlackAbsolute; - return result; - } - - result[0] = `[parse-filter E0005] (black) invalid domain: ${_domain}`; - result[1] = ParseType.ErrorMessage; - return result; - } - - // Possibly that entire rule is domain - - /** - * lineStartsWithSingleDot: - * - * `.cookielaw.js` - * `.content_tracking.js` - * `.ads.css` - * - * else: - * - * `_prebid.js` - * `t.yesware.com` - * `ubmcmm.baidustatic.com` - * `://www.smfg-card.$document` - * `portal.librus.pl$$advertisement-module` - * `@@-ds.metric.gstatic.com^|` - * `://gom.ge/cookie.js` - * `://accout-update-smba.jp.$document` - * `_200x250.png` - * `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg` - */ - let sliceStart = 0; - let sliceEnd = line.length; - let isWhieList = false; - - if (lineStartsWithSingleDot) { - // .usercentrics.eu^ - sliceStart = 1; - } else if (firstCharCode === 58 /** : */ && line.startsWith('://')) { - // ://backcb.one^$all - sliceStart = 3; - } - - if (line.endsWith('$all')) { - sliceEnd -= 4; - } else if (line.endsWith('$document')) { - sliceEnd -= 9; - } else if (line.endsWith('$badfilter')) { - isWhieList = true; - sliceEnd -= 10; - } - - const charBeforeModifier = line.charCodeAt(sliceEnd - 1); - if ( - charBeforeModifier === 94 /** ^$all, ^$document, etc. */ - || charBeforeModifier === 46 /** .$all */ - ) { + } else if (line.charCodeAt(line.length + sliceEnd - 1) === 46) { // 46 `.` + /** line.endsWith('.') */ sliceEnd -= 1; } - const sliced = (sliceStart !== 0 || sliceEnd !== line.length) ? line.slice(sliceStart, sliceEnd) : line; - - const tryNormalizeDomain = normalizeDomain(sliced); - if (tryNormalizeDomain === sliced) { - // the entire rule is domain - result[0] = sliced; - result[1] = isWhieList - ? ParseType.WhiteIncludeSubdomain - : ParseType.BlackIncludeSubdomain; - + const sliced = (sliceStart > 0 || sliceEnd < 0) ? line.slice(sliceStart, sliceEnd === 0 ? undefined : sliceEnd) : line; + if (sliced.charCodeAt(0) === 45 /* - */) { + // line.startsWith('-') is not a valid domain + result[1] = ParseType.ErrorMessage; + result[0] = `[parse-filter E0001] (${white ? 'white' : 'black'}) invalid domain: ${JSON.stringify({ + line, sliced, sliceStart, sliceEnd + })}`; return result; } - console.log({ - line, - lineEndsWithCaret, - lineEndsWithCaretOrCaretVerticalBar, - lineEndsWithCaretVerticalBar - }); + const suffix = tldts.getPublicSuffix(sliced, looseTldtsOpt); + if (!suffix) { + // This exclude domain-like resource like `_social_tracking.js^` + result[1] = ParseType.Null; + return result; + } - result[0] = `[parse-filter ${tryNormalizeDomain === null ? 'E0010' : 'E0011'}] can not parse: ${JSON.stringify({ line, tryNormalizeDomain, sliced, sliceStart, sliceEnd })}`; + const domain = normalizeDomain(sliced); + if (domain) { + result[0] = domain; + + if (white) { + result[1] = includeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute; + } else { + result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute; + } + return result; + } + + result[0] = `[parse-filter E0010] (${white ? 'white' : 'black'}) invalid domain: ${JSON.stringify({ + line, domain, suffix, sliced, sliceStart, sliceEnd + })}`; result[1] = ParseType.ErrorMessage; return result; } diff --git a/Source/non_ip/reject.conf b/Source/non_ip/reject.conf index e41d2f3c..c1b52d14 100644 --- a/Source/non_ip/reject.conf +++ b/Source/non_ip/reject.conf @@ -67,6 +67,14 @@ DOMAIN-KEYWORD,-attr.appsflyersdk.com DOMAIN-KEYWORD,-s2s.sensic.net DOMAIN-KEYWORD,-rtb.gravite.net +# >> Migrate from EasyPrivacy +DOMAIN-KEYWORD,analytics-cdn. +DOMAIN-KEYWORD,backstory.ebay. +DOMAIN-KEYWORD,click.rum. +DOMAIN-KEYWORD,cmpworker. +DOMAIN-KEYWORD,insights-collector. +DOMAIN-KEYWORD,track.opentable. + DOMAIN-WILDCARD,f-log*.grammarly.io DOMAIN-WILDCARD,*.ad.*.prod.hosts.ooklaserver.net