mirror of
https://github.com/SukkaW/Surge.git
synced 2025-12-13 01:30:37 +08:00
Improve filter parsing
This commit is contained in:
parent
6596ff074f
commit
43e373449f
@ -264,83 +264,64 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (
|
if (
|
||||||
filter.hostname // filter.hasHostname() // must have
|
filter.hostname !== undefined // filter.hasHostname() // must have
|
||||||
&& filter.isPlain() // isPlain() === !isRegex()
|
&& filter.isPlain() // isPlain() === !isRegex()
|
||||||
&& (!filter.isFullRegex())
|
// ghostery runs some strict checks against invalid syntax and marks those filters as regex as well
|
||||||
|
// https://github.com/ghostery/adblocker/blob/bfffdce89e741e7aa010de3759b4b536b7c23430/packages/adblocker/src/filters/network.ts#L1103
|
||||||
|
// So we manually salvage them instead of relying on those checks
|
||||||
|
// && (!filter.isRegex())
|
||||||
|
// && (!filter.isFullRegex()) // pattern starts and ends with "/", we can't parse this
|
||||||
) {
|
) {
|
||||||
|
const _1p = filter.firstParty();
|
||||||
|
const _3p = filter.thirdParty();
|
||||||
const white = filter.isException() || filter.isBadFilter();
|
const white = filter.isException() || filter.isBadFilter();
|
||||||
|
|
||||||
// We don't want tldts to call its own "extractHostname" on ip, bail out ip first.
|
|
||||||
// Now ip has been bailed out, we can safely set normalizeTldtsOpt.detectIp to false.
|
|
||||||
if (isProbablyIpv4(filter.hostname) || isProbablyIpv6(filter.hostname)) {
|
|
||||||
if (white) {
|
if (white) {
|
||||||
// We do not support whitelist IP anyway.
|
return onHostname(
|
||||||
result[1] = ParseType.Null;
|
filter.hostname,
|
||||||
return result;
|
white,
|
||||||
}
|
|
||||||
result[0] = filter.hostname;
|
|
||||||
result[1] = ParseType.BlackIP;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
const parsed = tldts.parse(filter.hostname, looseTldtsOpt);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* We can exclude wildcard in TLD
|
|
||||||
*
|
|
||||||
* ||example.*
|
|
||||||
*
|
|
||||||
* This also exclude non standard TLD like `.tor`, `.onion`, `.dn42`, etc.
|
|
||||||
*/
|
|
||||||
if (!parsed.publicSuffix || !parsed.isIcann || !parsed.hostname || !parsed.domain) {
|
|
||||||
result[1] = ParseType.Null;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
// |: filter.isHostnameAnchor(),
|
// |: filter.isHostnameAnchor(),
|
||||||
// |: filter.isLeftAnchor(),
|
// |: filter.isLeftAnchor(),
|
||||||
// |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
|
// |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
|
||||||
const isIncludeAllSubDomain = filter.isHostnameAnchor();
|
filter.isHostnameAnchor(),
|
||||||
|
line,
|
||||||
let hostname = parsed.hostname;
|
result
|
||||||
if (white) {
|
);
|
||||||
result[0] = filter.hostname;
|
|
||||||
result[1] = isIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute;
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// we only strip www when it is blacklist
|
|
||||||
if (parsed.subdomain) {
|
|
||||||
if (parsed.subdomain === 'www' || parsed.subdomain === 'xml-v4') {
|
|
||||||
hostname = parsed.domain;
|
|
||||||
}
|
|
||||||
if (parsed.subdomain.startsWith('www.')) {
|
|
||||||
hostname = parsed.subdomain.slice(4) + '.' + parsed.domain;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const _1p = filter.firstParty();
|
|
||||||
const _3p = filter.thirdParty();
|
|
||||||
|
|
||||||
if (_1p) { // first party is true
|
|
||||||
if (_3p) { // third party is also true
|
|
||||||
result[0] = hostname;
|
|
||||||
result[1] = isIncludeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
result[1] = ParseType.Null;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
if (_3p) {
|
if (_3p) {
|
||||||
if (includeThirdParty) {
|
if (_1p || includeThirdParty) { // both first party and third party are true
|
||||||
result[0] = hostname;
|
// only then we run onHostname
|
||||||
result[1] = isIncludeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
|
return onHostname(
|
||||||
return result;
|
filter.hostname,
|
||||||
|
white,
|
||||||
|
// |: filter.isHostnameAnchor(),
|
||||||
|
// |: filter.isLeftAnchor(),
|
||||||
|
// |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
|
||||||
|
filter.isHostnameAnchor(),
|
||||||
|
line,
|
||||||
|
result
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// only third party is true and w/o first party, there is no need to run onHostname anyway
|
||||||
result[1] = ParseType.Null;
|
result[1] = ParseType.Null;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// third party is already false
|
||||||
|
if (_1p) { // first part only
|
||||||
|
return onHostname(
|
||||||
|
filter.hostname,
|
||||||
|
white,
|
||||||
|
// |: filter.isHostnameAnchor(),
|
||||||
|
// |: filter.isLeftAnchor(),
|
||||||
|
// |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
|
||||||
|
filter.isHostnameAnchor(),
|
||||||
|
line,
|
||||||
|
result
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -353,7 +334,7 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
|
|||||||
let sliceStart = 0;
|
let sliceStart = 0;
|
||||||
let sliceEnd = 0;
|
let sliceEnd = 0;
|
||||||
|
|
||||||
// After NetworkFilter.parse, it means the line can not be parsed by cliqz NetworkFilter
|
// Reaching this point after NetworkFilter.parse means the line cannot be parsed by ghostery's NetworkFilter
|
||||||
// We now need to "salvage" the line as much as possible
|
// We now need to "salvage" the line as much as possible
|
||||||
|
|
||||||
let white = false;
|
let white = false;
|
||||||
@ -370,14 +351,10 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Some "malformed" regex-based filters can not be parsed by NetworkFilter
|
* Some "malformed" regex-based filters can not be parsed by NetworkFilter
|
||||||
* "$genericblock`" is also not supported by NetworkFilter, see:
|
|
||||||
* https://github.com/ghostery/adblocker/blob/62caf7786ba10ef03beffecd8cd4eec111bcd5ec/packages/adblocker/test/parsing.test.ts#L950
|
|
||||||
*
|
*
|
||||||
* `@@||cmechina.net^$genericblock`
|
|
||||||
* `@@|ftp.bmp.ovh^|`
|
* `@@|ftp.bmp.ovh^|`
|
||||||
* `@@|adsterra.com^|`
|
* `@@|adsterra.com^|`
|
||||||
* `@@.atlassian.net$document`
|
* `@@.atlassian.net$document`
|
||||||
* `@@||ad.alimama.com^$genericblock`
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
switch (line.charCodeAt(sliceStart)) {
|
switch (line.charCodeAt(sliceStart)) {
|
||||||
@ -501,19 +478,33 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return onHostname(sliced, white, includeAllSubDomain, line, result);
|
||||||
|
}
|
||||||
|
|
||||||
|
function onHostname(
|
||||||
|
input: string,
|
||||||
|
white: boolean,
|
||||||
|
isIncludeAllSubDomain: boolean,
|
||||||
|
rawLine: string,
|
||||||
|
result: [string, ParseType]
|
||||||
|
) {
|
||||||
// We don't want tldts to call its own "extractHostname" on ip, bail out ip first.
|
// We don't want tldts to call its own "extractHostname" on ip, bail out ip first.
|
||||||
// Now ip has been bailed out, we can safely set normalizeTldtsOpt.detectIp to false.
|
if (isProbablyIpv4(input) || isProbablyIpv6(input)) {
|
||||||
if (isProbablyIpv4(sliced) || isProbablyIpv6(sliced)) {
|
if (white) {
|
||||||
// TODO: we might want to implements reject ip in the future
|
// We do not support whitelist IP anyway.
|
||||||
result[0] = `[parse-filter E0002] (${white ? 'white' : 'black'}) ip: ${JSON.stringify({
|
result[0] = `[parse-filter E0022] (white) no whitelist ip support: ${JSON.stringify({
|
||||||
line, sliced, sliceStart, sliceEnd
|
input, rawLine
|
||||||
})}`;
|
})}`;
|
||||||
result[1] = ParseType.ErrorMessage;
|
result[1] = ParseType.ErrorMessage;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
result[0] = input;
|
||||||
|
result[1] = ParseType.BlackIP;
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
// Now ip has been bailed out, we can safely set normalizeTldtsOpt.detectIp to false.
|
||||||
|
|
||||||
const parsed = tldts.parse(sliced, looseTldtsOpt);
|
const parsed = tldts.parse(input, looseTldtsOpt);
|
||||||
const hostname = parsed.hostname;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* We can exclude wildcard in TLD
|
* We can exclude wildcard in TLD
|
||||||
@ -527,12 +518,14 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
|
|||||||
*
|
*
|
||||||
* This also exclude non standard TLD like `.tor`, `.onion`, `.dn42`, etc.
|
* This also exclude non standard TLD like `.tor`, `.onion`, `.dn42`, etc.
|
||||||
*/
|
*/
|
||||||
if (!parsed.publicSuffix || !parsed.isIcann || !hostname || !parsed.domain) {
|
if (!parsed.publicSuffix || !parsed.isIcann || !parsed.hostname || !parsed.domain) {
|
||||||
result[1] = ParseType.Null;
|
result[1] = ParseType.Null;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
// no wildcard, we can safely normalize it˝
|
let hostname = parsed.hostname;
|
||||||
|
|
||||||
|
// no wildcard, we can safely normalize it
|
||||||
if (!hostname.includes('*')) {
|
if (!hostname.includes('*')) {
|
||||||
if (hostname.charCodeAt(0) === 45) { // 45 `-`
|
if (hostname.charCodeAt(0) === 45) { // 45 `-`
|
||||||
result[0] = hostname;
|
result[0] = hostname;
|
||||||
@ -542,26 +535,21 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
|
|||||||
|
|
||||||
if (white) {
|
if (white) {
|
||||||
result[0] = hostname;
|
result[0] = hostname;
|
||||||
result[1] = includeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute;
|
result[1] = isIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
// blacklist, we can strip www from subdomain
|
// we only strip www when it is blacklist
|
||||||
if (parsed.subdomain) {
|
if (parsed.subdomain) {
|
||||||
if (parsed.subdomain === 'www' || parsed.subdomain === 'xml-v4') {
|
if (parsed.subdomain === 'www' || parsed.subdomain === 'xml-v4') {
|
||||||
result[0] = parsed.domain;
|
hostname = parsed.domain;
|
||||||
result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
|
} else if (parsed.subdomain.startsWith('www.')) {
|
||||||
return result;
|
hostname = parsed.subdomain.slice(4) + '.' + parsed.domain;
|
||||||
}
|
|
||||||
if (parsed.subdomain.startsWith('www.')) {
|
|
||||||
result[0] = parsed.subdomain.slice(4) + '.' + parsed.domain;
|
|
||||||
result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
result[0] = hostname;
|
result[0] = hostname;
|
||||||
result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
|
result[1] = isIncludeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -571,7 +559,7 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
|
|||||||
// result[1] = ParseType.Null;
|
// result[1] = ParseType.Null;
|
||||||
// return result;
|
// return result;
|
||||||
result[0] = `[parse-filter E0021] wildcard whitelist not supported: ${JSON.stringify({
|
result[0] = `[parse-filter E0021] wildcard whitelist not supported: ${JSON.stringify({
|
||||||
line, sliced, sliceStart, sliceEnd, parsed
|
input, rawLine, parsed
|
||||||
})}`;
|
})}`;
|
||||||
result[1] = ParseType.ErrorMessage;
|
result[1] = ParseType.ErrorMessage;
|
||||||
return result;
|
return result;
|
||||||
@ -593,12 +581,17 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
|
|||||||
}
|
}
|
||||||
|
|
||||||
result[0] = `[parse-filter E0020] (black) invalid wildcard domain: ${JSON.stringify({
|
result[0] = `[parse-filter E0020] (black) invalid wildcard domain: ${JSON.stringify({
|
||||||
line, sliced, sliceStart, sliceEnd, parsed
|
input, rawLine, parsed
|
||||||
})}`;
|
})}`;
|
||||||
result[1] = ParseType.ErrorMessage;
|
result[1] = ParseType.ErrorMessage;
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (hostname.charCodeAt(0) === 45) { // 45 `-`
|
||||||
|
// starts with - and also containing * wildcard
|
||||||
|
hostname = '*' + hostname;
|
||||||
|
}
|
||||||
|
|
||||||
result[0] = hostname;
|
result[0] = hostname;
|
||||||
result[1] = ParseType.BlackWildcard;
|
result[1] = ParseType.BlackWildcard;
|
||||||
return result;
|
return result;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user