Refactor/Perf: rewrite how rules are salvaged

This commit is contained in:
SukkaW 2025-01-04 01:30:35 +08:00
parent ff6db02b99
commit 2643903b24
2 changed files with 159 additions and 330 deletions

View File

@ -300,7 +300,7 @@ export function parse($line: string, result: [string, ParseType], allowThirdPart
return result;
}
let line = $line.trim();
const line = $line.trim();
if (line.length === 0) {
result[1] = ParseType.Null;
@ -308,11 +308,14 @@ export function parse($line: string, result: [string, ParseType], allowThirdPart
}
const firstCharCode = line.charCodeAt(0);
let lastCharCode = line.charCodeAt(line.length - 1);
const lastCharCode = line.charCodeAt(line.length - 1);
if (
firstCharCode === 47 // 47 `/`
// ends with
// _160-600.
// -detect-adblock.
// _web-advert.
|| lastCharCode === 46 // 46 `.`, line.endsWith('.')
|| lastCharCode === 45 // 45 `-`, line.endsWith('-')
|| lastCharCode === 95 // 95 `_`, line.endsWith('_')
@ -405,46 +408,29 @@ export function parse($line: string, result: [string, ParseType], allowThirdPart
}
}
/**
* From now on, we are mostly facing non-standard domain rules (some are regex like)
*
* We can still salvage some of them by removing modifiers
*/
let sliceStart = 0;
let sliceEnd = 0;
// Reaching this point means the line could not be parsed by cliqz NetworkFilter (NetworkFilter.parse).
// We now need to "salvage" as much of the line as possible
/*
* From now on, we are mostly facing non-standard domain rules (some are regex like)
* We first skip third-party and frame rules, as Surge / Clash can't handle them
*
* `.sharecounter.$third-party`
* `.bbelements.com^$third-party`
* `://o0e.ru^$third-party`
* `.1.1.1.l80.js^$third-party`
*/
if (line.includes('$third-party')) {
if (!allowThirdParty) {
result[1] = ParseType.Null;
return result;
}
let white = false;
let includeAllSubDomain = false;
line = line
.replace('$third-party,', '$')
.replace('$third-party', '');
}
lastCharCode = line.charCodeAt(line.length - 1);
/** @example line.endsWith('^') */
const lineEndsWithCaret = lastCharCode === 94; // lastChar === '^';
/** @example line.endsWith('|') */
const lineEndsWithVerticalBar = lastCharCode === 124; // lastChar === '|';
/** @example line.endsWith('^|') */
const lineEndsWithCaretVerticalBar = lineEndsWithVerticalBar && line[line.length - 2] === '^';
/** @example line.endsWith('^') || line.endsWith('^|') */
const lineEndsWithCaretOrCaretVerticalBar = lineEndsWithCaret || lineEndsWithCaretVerticalBar;
// whitelist (exception)
if (
firstCharCode === 64 // 64 `@`
&& line[1] === '@'
&& line.charCodeAt(1) === 64 // 64 `@`
) {
let whiteIncludeAllSubDomain = true;
sliceStart += 2;
white = true;
includeAllSubDomain = true;
}
/**
* Some "malformed" regex-based filters can not be parsed by NetworkFilter
@ -458,38 +444,55 @@ export function parse($line: string, result: [string, ParseType], allowThirdPart
* `@@||ad.alimama.com^$genericblock`
*/
let sliceStart = 0;
let sliceEnd: number | undefined;
switch (line.charCodeAt(sliceStart)) {
case 124: /** | */
// line.startsWith('@@|') || line.startsWith('|')
sliceStart += 1;
includeAllSubDomain = false;
switch (line[2]) {
case '|':
// line.startsWith('@@|')
sliceStart = 3;
whiteIncludeAllSubDomain = false;
if (line[3] === '|') { // line.startsWith('@@||')
sliceStart = 4;
whiteIncludeAllSubDomain = true;
if (line[sliceStart] === '|') { // line.startsWith('@@||') || line.startsWith('||')
sliceStart += 1;
includeAllSubDomain = true;
}
break;
case '.': { // line.startsWith('@@.')
sliceStart = 3;
whiteIncludeAllSubDomain = true;
break;
}
case ':': {
case 46: { /** . */ // line.startsWith('@@.') || line.startsWith('.')
/**
* `.ay.delivery^`
* `.m.bookben.com^`
* `.wap.x4399.com^`
*/
sliceStart += 1;
includeAllSubDomain = true;
break;
}
default:
break;
}
switch (line.charCodeAt(sliceStart)) {
case 58: { /** : */
/**
* line.startsWith('@@://')
*
* `@@://googleadservices.com^|`
* `@@://www.googleadservices.com^|`
* `://mine.torrent.pw^`
* `://say.ac^`
*/
if (line[3] === '/' && line[4] === '/') {
whiteIncludeAllSubDomain = false;
sliceStart = 5;
if (line[sliceStart + 1] === '/' && line[sliceStart + 2] === '/') {
includeAllSubDomain = false;
sliceStart += 3;
}
break;
}
case 104: { /** h */
/** |http://x.o2.pl^ */
if (line.startsWith('http://', sliceStart)) {
sliceStart += 7;
} else if (line.startsWith('https://', sliceStart)) {
sliceStart += 8;
}
break;
}
@ -498,170 +501,37 @@ export function parse($line: string, result: [string, ParseType], allowThirdPart
break;
}
if (lineEndsWithCaret) {
sliceEnd = -1;
} else if (lineEndsWithVerticalBar) {
// It is possible that a whitelist filter ends with '|' without '^|'
// @@|www.auslogics.com|
sliceEnd = lineEndsWithCaretVerticalBar ? -2 : -1;
} else if (line.endsWith('$genericblock')) {
sliceEnd = -13;
if (line[line.length - 14] === '^') { // line.endsWith('^$genericblock')
sliceEnd = -14;
}
} else if (line.endsWith('$document')) {
sliceEnd = -9;
if (line[line.length - 10] === '^') { // line.endsWith('^$document')
sliceEnd = -10;
}
const indexOfDollar = line.indexOf('$', sliceStart);
if (indexOfDollar > -1) {
sliceEnd = indexOfDollar - line.length;
}
if (sliceStart !== 0 || sliceEnd !== undefined) {
const sliced = line.slice(sliceStart, sliceEnd);
const domain = normalizeDomain(sliced);
if (domain) {
result[0] = domain;
result[1] = whiteIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute;
return result;
}
result[0] = `[parse-filter E0001] (white) invalid domain: ${JSON.stringify({
line, sliced, sliceStart, sliceEnd, domain
})}`;
result[1] = ParseType.ErrorMessage;
return result;
}
result[0] = `[parse-filter E0006] (white) failed to parse: ${JSON.stringify({
line, sliceStart, sliceEnd
})}`;
result[1] = ParseType.ErrorMessage;
return result;
}
if (
// 124 `|`
// line.startsWith('|')
firstCharCode === 124
&& lineEndsWithCaretOrCaretVerticalBar
) {
/**
* Some malformed filters can not be parsed by NetworkFilter:
/*
* We skip third-party and frame rules, as Surge / Clash can't handle them
*
* `||smetrics.teambeachbody.com^.com^`
* `||solutions.|pages.indigovision.com^`
* `||vystar..0rg@client.iebetanialaargentina.edu.co^`
* `app-uat.latrobehealth.com.au^predirect.snapdeal.com`
* `.sharecounter.$third-party`
* `.bbelements.com^$third-party`
* `://o0e.ru^$third-party`
* `.1.1.1.l80.js^$third-party`
*/
const includeAllSubDomain = line[1] === '|';
const sliceStart = includeAllSubDomain ? 2 : 1;
const sliceEnd = lineEndsWithCaret
? -1
: (lineEndsWithCaretVerticalBar ? -2 : undefined);
const sliced = line.slice(sliceStart, sliceEnd); // we already make sure line startsWith "|"
const domain = normalizeDomain(sliced);
if (domain) {
result[0] = domain;
result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
return result;
}
result[0] = `[parse-filter E0002] (black) invalid domain: ${sliced}`;
result[1] = ParseType.ErrorMessage;
return result;
}
// if (line.endsWith('$image')) {
// /**
// * Some $image filters are not NetworkFilter:
// *
// * `app.site123.com$image`
// * `t.signaux$image`
// * `track.customer.io$image`
// */
// }
const lineStartsWithSingleDot = firstCharCode === 46; // 46 `.`
if (
lineStartsWithSingleDot
&& lineEndsWithCaretOrCaretVerticalBar
!allowThirdParty
&& (
line.includes('third-party', indexOfDollar + 1)
|| line.includes('3p', indexOfDollar + 1)
)
) {
/**
* `.ay.delivery^`
* `.m.bookben.com^`
* `.wap.x4399.com^`
*/
const sliced = line.slice(
1, // remove prefix dot
lineEndsWithCaret // replaceAll('^', '')
? -1
: (lineEndsWithCaretVerticalBar ? -2 : undefined) // replace('^|', '')
);
const suffix = tldts.getPublicSuffix(sliced, looseTldtsOpt);
if (!suffix) {
// This excludes domain-like resources such as `1.1.4.514.js`
result[1] = ParseType.Null;
return result;
}
const domain = normalizeDomain(sliced);
if (domain) {
result[0] = domain;
result[1] = ParseType.BlackIncludeSubdomain;
return result;
if (line.includes('badfilter', indexOfDollar + 1)) {
white = true;
}
if (line.includes('all', indexOfDollar + 1)) {
includeAllSubDomain = true;
}
result[0] = `[parse-filter E0003] (black) invalid domain: ${JSON.stringify({ sliced, domain })}`;
result[1] = ParseType.ErrorMessage;
return result;
}
/**
* `|http://x.o2.pl^`
* `://mine.torrent.pw^`
* `://say.ac^`
*/
if (lineEndsWithCaretOrCaretVerticalBar) {
let sliceStart = 0;
let sliceEnd;
if (lineEndsWithCaret) { // line.endsWith('^')
sliceEnd = -1;
} else if (lineEndsWithCaretVerticalBar) { // line.endsWith('^|')
sliceEnd = -2;
}
if (line.startsWith('://')) {
sliceStart = 3;
} else if (line.startsWith('http://')) {
sliceStart = 7;
} else if (line.startsWith('https://')) {
sliceStart = 8;
} else if (line.startsWith('|http://')) {
sliceStart = 8;
} else if (line.startsWith('|https://')) {
sliceStart = 9;
}
if (sliceStart !== 0 || sliceEnd !== undefined) {
const sliced = line.slice(sliceStart, sliceEnd);
const domain = normalizeDomain(sliced);
if (domain) {
result[0] = domain;
result[1] = ParseType.BlackIncludeSubdomain;
return result;
}
result[0] = `[parse-filter E0004] (black) invalid domain: ${JSON.stringify({
line, sliced, sliceStart, sliceEnd, domain
})}`;
result[1] = ParseType.ErrorMessage;
return result;
}
}
/**
* `_vmind.qqvideo.tc.qq.com^`
* `arketing.indianadunes.com^`
@ -671,103 +541,54 @@ export function parse($line: string, result: [string, ParseType], allowThirdPart
* `-logging.nextmedia.com`
* `_social_tracking.js^`
*/
if (
firstCharCode !== 124 // 124 `|`
&& lastCharCode === 94 // 94 `^`
) {
const _domain = line.slice(0, -1);
if (line.charCodeAt(line.length + sliceEnd - 1) === 94) { // 94 `^`
/** line.endsWith('^') */
sliceEnd -= 1;
} else if (line.charCodeAt(line.length + sliceEnd - 1) === 124) { // 124 `|`
/** line.endsWith('|') */
sliceEnd -= 1;
const suffix = tldts.getPublicSuffix(_domain, looseTldtsOpt);
if (line.charCodeAt(line.length + sliceEnd - 1) === 94) { // 94 `^`
/** line.endsWith('^|') */
sliceEnd -= 1;
}
} else if (line.charCodeAt(line.length + sliceEnd - 1) === 46) { // 46 `.`
/** line.endsWith('.') */
sliceEnd -= 1;
}
const sliced = (sliceStart > 0 || sliceEnd < 0) ? line.slice(sliceStart, sliceEnd === 0 ? undefined : sliceEnd) : line;
if (sliced.charCodeAt(0) === 45 /* - */) {
// line.startsWith('-') is not a valid domain
result[1] = ParseType.ErrorMessage;
result[0] = `[parse-filter E0001] (${white ? 'white' : 'black'}) invalid domain: ${JSON.stringify({
line, sliced, sliceStart, sliceEnd
})}`;
return result;
}
const suffix = tldts.getPublicSuffix(sliced, looseTldtsOpt);
if (!suffix) {
// This excludes domain-like resources such as `_social_tracking.js^`
result[1] = ParseType.Null;
return result;
}
const domain = normalizeDomain(_domain);
const domain = normalizeDomain(sliced);
if (domain) {
result[0] = domain;
result[1] = ParseType.BlackAbsolute;
if (white) {
result[1] = includeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute;
} else {
result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
}
return result;
}
result[0] = `[parse-filter E0005] (black) invalid domain: ${_domain}`;
result[1] = ParseType.ErrorMessage;
return result;
}
// Possibly the entire rule is a domain
/**
* lineStartsWithSingleDot:
*
* `.cookielaw.js`
* `.content_tracking.js`
* `.ads.css`
*
* else:
*
* `_prebid.js`
* `t.yesware.com`
* `ubmcmm.baidustatic.com`
* `://www.smfg-card.$document`
* `portal.librus.pl$$advertisement-module`
* `@@-ds.metric.gstatic.com^|`
* `://gom.ge/cookie.js`
* `://accout-update-smba.jp.$document`
* `_200x250.png`
* `@@://www.liquidweb.com/kb/wp-content/themes/lw-kb-theme/images/ads/vps-sidebar.jpg`
*/
let sliceStart = 0;
let sliceEnd = line.length;
let isWhieList = false;
if (lineStartsWithSingleDot) {
// .usercentrics.eu^
sliceStart = 1;
} else if (firstCharCode === 58 /** : */ && line.startsWith('://')) {
// ://backcb.one^$all
sliceStart = 3;
}
if (line.endsWith('$all')) {
sliceEnd -= 4;
} else if (line.endsWith('$document')) {
sliceEnd -= 9;
} else if (line.endsWith('$badfilter')) {
isWhieList = true;
sliceEnd -= 10;
}
const charBeforeModifier = line.charCodeAt(sliceEnd - 1);
if (
charBeforeModifier === 94 /** ^$all, ^$document, etc. */
|| charBeforeModifier === 46 /** .$all */
) {
sliceEnd -= 1;
}
const sliced = (sliceStart !== 0 || sliceEnd !== line.length) ? line.slice(sliceStart, sliceEnd) : line;
const tryNormalizeDomain = normalizeDomain(sliced);
if (tryNormalizeDomain === sliced) {
// the entire rule is a domain
result[0] = sliced;
result[1] = isWhieList
? ParseType.WhiteIncludeSubdomain
: ParseType.BlackIncludeSubdomain;
return result;
}
console.log({
line,
lineEndsWithCaret,
lineEndsWithCaretOrCaretVerticalBar,
lineEndsWithCaretVerticalBar
});
result[0] = `[parse-filter ${tryNormalizeDomain === null ? 'E0010' : 'E0011'}] can not parse: ${JSON.stringify({ line, tryNormalizeDomain, sliced, sliceStart, sliceEnd })}`;
result[0] = `[parse-filter E0010] (${white ? 'white' : 'black'}) invalid domain: ${JSON.stringify({
line, domain, suffix, sliced, sliceStart, sliceEnd
})}`;
result[1] = ParseType.ErrorMessage;
return result;
}

View File

@ -67,6 +67,14 @@ DOMAIN-KEYWORD,-attr.appsflyersdk.com
DOMAIN-KEYWORD,-s2s.sensic.net
DOMAIN-KEYWORD,-rtb.gravite.net
# >> Migrate from EasyPrivacy
DOMAIN-KEYWORD,analytics-cdn.
DOMAIN-KEYWORD,backstory.ebay.
DOMAIN-KEYWORD,click.rum.
DOMAIN-KEYWORD,cmpworker.
DOMAIN-KEYWORD,insights-collector.
DOMAIN-KEYWORD,track.opentable.
DOMAIN-WILDCARD,f-log*.grammarly.io
DOMAIN-WILDCARD,*.ad.*.prod.hosts.ooklaserver.net