mirror of
https://github.com/SukkaW/Surge.git
synced 2025-12-12 09:10:35 +08:00
Improve filter parsing
This commit is contained in:
parent
6596ff074f
commit
43e373449f
@ -264,82 +264,63 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
|
||||
}
|
||||
|
||||
if (
|
||||
filter.hostname // filter.hasHostname() // must have
|
||||
filter.hostname !== undefined // filter.hasHostname() // must have
|
||||
&& filter.isPlain() // isPlain() === !isRegex()
|
||||
&& (!filter.isFullRegex())
|
||||
// ghostery runs some strict checks against invalid syntax and marks them as regex as well
|
||||
// https://github.com/ghostery/adblocker/blob/bfffdce89e741e7aa010de3759b4b536b7c23430/packages/adblocker/src/filters/network.ts#L1103
|
||||
// So we manually salvage them instead of relying on them
|
||||
// && (!filter.isRegex())
|
||||
// && (!filter.isFullRegex()) // pattern starts and ends with "/", we can't parse this
|
||||
) {
|
||||
const white = filter.isException() || filter.isBadFilter();
|
||||
|
||||
// We don't want tldts to call its own "extractHostname" on ip, bail out ip first.
|
||||
// Now ip has been bailed out, we can safely set normalizeTldtsOpt.detectIp to false.
|
||||
if (isProbablyIpv4(filter.hostname) || isProbablyIpv6(filter.hostname)) {
|
||||
if (white) {
|
||||
// We do not support whitelist IP anyway.
|
||||
result[1] = ParseType.Null;
|
||||
return result;
|
||||
}
|
||||
result[0] = filter.hostname;
|
||||
result[1] = ParseType.BlackIP;
|
||||
return result;
|
||||
}
|
||||
|
||||
const parsed = tldts.parse(filter.hostname, looseTldtsOpt);
|
||||
|
||||
/**
|
||||
* We can exclude wildcard in TLD
|
||||
*
|
||||
* ||example.*
|
||||
*
|
||||
* This also excludes non-standard TLDs like `.tor`, `.onion`, `.dn42`, etc.
|
||||
*/
|
||||
if (!parsed.publicSuffix || !parsed.isIcann || !parsed.hostname || !parsed.domain) {
|
||||
result[1] = ParseType.Null;
|
||||
return result;
|
||||
}
|
||||
|
||||
// |: filter.isHostnameAnchor(),
|
||||
// |: filter.isLeftAnchor(),
|
||||
// |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
|
||||
const isIncludeAllSubDomain = filter.isHostnameAnchor();
|
||||
|
||||
let hostname = parsed.hostname;
|
||||
if (white) {
|
||||
result[0] = filter.hostname;
|
||||
result[1] = isIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute;
|
||||
return result;
|
||||
}
|
||||
|
||||
// we only strip www when it is blacklist
|
||||
if (parsed.subdomain) {
|
||||
if (parsed.subdomain === 'www' || parsed.subdomain === 'xml-v4') {
|
||||
hostname = parsed.domain;
|
||||
}
|
||||
if (parsed.subdomain.startsWith('www.')) {
|
||||
hostname = parsed.subdomain.slice(4) + '.' + parsed.domain;
|
||||
}
|
||||
}
|
||||
|
||||
const _1p = filter.firstParty();
|
||||
const _3p = filter.thirdParty();
|
||||
const white = filter.isException() || filter.isBadFilter();
|
||||
|
||||
if (_1p) { // first party is true
|
||||
if (_3p) { // third party is also true
|
||||
result[0] = hostname;
|
||||
result[1] = isIncludeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
|
||||
if (white) {
|
||||
return onHostname(
|
||||
filter.hostname,
|
||||
white,
|
||||
// |: filter.isHostnameAnchor(),
|
||||
// |: filter.isLeftAnchor(),
|
||||
// |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
|
||||
filter.isHostnameAnchor(),
|
||||
line,
|
||||
result
|
||||
);
|
||||
}
|
||||
|
||||
return result;
|
||||
if (_3p) {
|
||||
if (_1p || includeThirdParty) { // both first party and third party are true
|
||||
// only then we run onHostname
|
||||
return onHostname(
|
||||
filter.hostname,
|
||||
white,
|
||||
// |: filter.isHostnameAnchor(),
|
||||
// |: filter.isLeftAnchor(),
|
||||
// |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
|
||||
filter.isHostnameAnchor(),
|
||||
line,
|
||||
result
|
||||
);
|
||||
}
|
||||
|
||||
// only third party is true and w/o first party, there is no need to run onHostname anyway
|
||||
result[1] = ParseType.Null;
|
||||
return result;
|
||||
}
|
||||
if (_3p) {
|
||||
if (includeThirdParty) {
|
||||
result[0] = hostname;
|
||||
result[1] = isIncludeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
|
||||
return result;
|
||||
}
|
||||
result[1] = ParseType.Null;
|
||||
return result;
|
||||
|
||||
// third party is already false
|
||||
if (_1p) { // first party only
|
||||
return onHostname(
|
||||
filter.hostname,
|
||||
white,
|
||||
// |: filter.isHostnameAnchor(),
|
||||
// |: filter.isLeftAnchor(),
|
||||
// |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
|
||||
filter.isHostnameAnchor(),
|
||||
line,
|
||||
result
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -353,7 +334,7 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
|
||||
let sliceStart = 0;
|
||||
let sliceEnd = 0;
|
||||
|
||||
// After NetworkFilter.parse, it means the line can not be parsed by cliqz NetworkFilter
|
||||
// After NetworkFilter.parse, it means the line can not be parsed by ghostery NetworkFilter
|
||||
// We now need to "salvage" the line as much as possible
|
||||
|
||||
let white = false;
|
||||
@ -370,14 +351,10 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
|
||||
|
||||
/**
|
||||
* Some "malformed" regex-based filters can not be parsed by NetworkFilter
|
||||
* "$genericblock" is also not supported by NetworkFilter, see:
|
||||
* https://github.com/ghostery/adblocker/blob/62caf7786ba10ef03beffecd8cd4eec111bcd5ec/packages/adblocker/test/parsing.test.ts#L950
|
||||
*
|
||||
* `@@||cmechina.net^$genericblock`
|
||||
* `@@|ftp.bmp.ovh^|`
|
||||
* `@@|adsterra.com^|`
|
||||
* `@@.atlassian.net$document`
|
||||
* `@@||ad.alimama.com^$genericblock`
|
||||
*/
|
||||
|
||||
switch (line.charCodeAt(sliceStart)) {
|
||||
@ -501,19 +478,33 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
|
||||
return result;
|
||||
}
|
||||
|
||||
return onHostname(sliced, white, includeAllSubDomain, line, result);
|
||||
}
|
||||
|
||||
function onHostname(
|
||||
input: string,
|
||||
white: boolean,
|
||||
isIncludeAllSubDomain: boolean,
|
||||
rawLine: string,
|
||||
result: [string, ParseType]
|
||||
) {
|
||||
// We don't want tldts to call its own "extractHostname" on ip, bail out ip first.
|
||||
// Now ip has been bailed out, we can safely set normalizeTldtsOpt.detectIp to false.
|
||||
if (isProbablyIpv4(sliced) || isProbablyIpv6(sliced)) {
|
||||
// TODO: we might want to implement rejecting IPs in the future
|
||||
result[0] = `[parse-filter E0002] (${white ? 'white' : 'black'}) ip: ${JSON.stringify({
|
||||
line, sliced, sliceStart, sliceEnd
|
||||
})}`;
|
||||
result[1] = ParseType.ErrorMessage;
|
||||
if (isProbablyIpv4(input) || isProbablyIpv6(input)) {
|
||||
if (white) {
|
||||
// We do not support whitelist IP anyway.
|
||||
result[0] = `[parse-filter E0022] (white) no whitelist ip support: ${JSON.stringify({
|
||||
input, rawLine
|
||||
})}`;
|
||||
result[1] = ParseType.ErrorMessage;
|
||||
return result;
|
||||
}
|
||||
result[0] = input;
|
||||
result[1] = ParseType.BlackIP;
|
||||
return result;
|
||||
}
|
||||
// Now ip has been bailed out, we can safely set normalizeTldtsOpt.detectIp to false.
|
||||
|
||||
const parsed = tldts.parse(sliced, looseTldtsOpt);
|
||||
const hostname = parsed.hostname;
|
||||
const parsed = tldts.parse(input, looseTldtsOpt);
|
||||
|
||||
/**
|
||||
* We can exclude wildcard in TLD
|
||||
@ -527,12 +518,14 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
|
||||
*
|
||||
* This also excludes non-standard TLDs like `.tor`, `.onion`, `.dn42`, etc.
|
||||
*/
|
||||
if (!parsed.publicSuffix || !parsed.isIcann || !hostname || !parsed.domain) {
|
||||
if (!parsed.publicSuffix || !parsed.isIcann || !parsed.hostname || !parsed.domain) {
|
||||
result[1] = ParseType.Null;
|
||||
return result;
|
||||
}
|
||||
|
||||
// no wildcard, we can safely normalize it
|
||||
let hostname = parsed.hostname;
|
||||
|
||||
// no wildcard, we can safely normalize it
|
||||
if (!hostname.includes('*')) {
|
||||
if (hostname.charCodeAt(0) === 45) { // 45 `-`
|
||||
result[0] = hostname;
|
||||
@ -542,26 +535,21 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
|
||||
|
||||
if (white) {
|
||||
result[0] = hostname;
|
||||
result[1] = includeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute;
|
||||
result[1] = isIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute;
|
||||
return result;
|
||||
}
|
||||
|
||||
// blacklist, we can strip www from subdomain
|
||||
// we only strip www when it is blacklist
|
||||
if (parsed.subdomain) {
|
||||
if (parsed.subdomain === 'www' || parsed.subdomain === 'xml-v4') {
|
||||
result[0] = parsed.domain;
|
||||
result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
|
||||
return result;
|
||||
}
|
||||
if (parsed.subdomain.startsWith('www.')) {
|
||||
result[0] = parsed.subdomain.slice(4) + '.' + parsed.domain;
|
||||
result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
|
||||
return result;
|
||||
hostname = parsed.domain;
|
||||
} else if (parsed.subdomain.startsWith('www.')) {
|
||||
hostname = parsed.subdomain.slice(4) + '.' + parsed.domain;
|
||||
}
|
||||
}
|
||||
|
||||
result[0] = hostname;
|
||||
result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
|
||||
result[1] = isIncludeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -571,7 +559,7 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
|
||||
// result[1] = ParseType.Null;
|
||||
// return result;
|
||||
result[0] = `[parse-filter E0021] wildcard whitelist not supported: ${JSON.stringify({
|
||||
line, sliced, sliceStart, sliceEnd, parsed
|
||||
input, rawLine, parsed
|
||||
})}`;
|
||||
result[1] = ParseType.ErrorMessage;
|
||||
return result;
|
||||
@ -593,12 +581,17 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
|
||||
}
|
||||
|
||||
result[0] = `[parse-filter E0020] (black) invalid wildcard domain: ${JSON.stringify({
|
||||
line, sliced, sliceStart, sliceEnd, parsed
|
||||
input, rawLine, parsed
|
||||
})}`;
|
||||
result[1] = ParseType.ErrorMessage;
|
||||
return result;
|
||||
}
|
||||
|
||||
if (hostname.charCodeAt(0) === 45) { // 45 `-`
|
||||
// starts with `-` and also contains a `*` wildcard
|
||||
hostname = '*' + hostname;
|
||||
}
|
||||
|
||||
result[0] = hostname;
|
||||
result[1] = ParseType.BlackWildcard;
|
||||
return result;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user