Improve parsing (ready for reject ip and reject wildcard)
Some checks failed
Build / Build (push) Has been cancelled
Build / Diff output (push) Has been cancelled
Build / Deploy to Cloudflare Pages (3.114.9) (push) Has been cancelled
Build / Deploy to GitHub and GitLab (push) Has been cancelled

This commit is contained in:
SukkaW 2025-06-20 15:26:14 +08:00
parent 2ba50bf850
commit 172e4efd8a
2 changed files with 76 additions and 24 deletions

View File

@ -3,9 +3,9 @@ import type * as tldts from 'tldts';
export const looseTldtsOpt: NonNullable<Parameters<typeof tldts.getSubdomain>[1]> = { export const looseTldtsOpt: NonNullable<Parameters<typeof tldts.getSubdomain>[1]> = {
allowPrivateDomains: false, allowPrivateDomains: false,
extractHostname: false, extractHostname: false,
mixedInputs: false,
validateHostname: false, validateHostname: false,
detectIp: false, detectIp: false
mixedInputs: false
}; };
export const loosTldOptWithPrivateDomains: NonNullable<Parameters<typeof tldts.getSubdomain>[1]> = { export const loosTldOptWithPrivateDomains: NonNullable<Parameters<typeof tldts.getSubdomain>[1]> = {

View File

@ -4,9 +4,8 @@ import { fetchAssets } from '../fetch-assets';
import { onBlackFound, onWhiteFound } from './shared'; import { onBlackFound, onWhiteFound } from './shared';
import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie'; import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie';
import { looseTldtsOpt } from '../../constants/loose-tldts-opt'; import { looseTldtsOpt } from '../../constants/loose-tldts-opt';
import tldts from 'tldts-experimental'; import tldts from 'tldts';
import { NetworkFilter } from '@ghostery/adblocker'; import { NetworkFilter } from '@ghostery/adblocker';
import { fastNormalizeDomain, fastNormalizeDomainWithoutWww, fastNormalizeDomainWithoutWwwNoIP } from '../normalize-domain';
import { isProbablyIpv4, isProbablyIpv6 } from 'foxts/is-probably-ip'; import { isProbablyIpv4, isProbablyIpv6 } from 'foxts/is-probably-ip';
const enum ParseType { const enum ParseType {
@ -232,16 +231,31 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
&& filter.isPlain() // isPlain() === !isRegex() && filter.isPlain() // isPlain() === !isRegex()
&& (!filter.isFullRegex()) && (!filter.isFullRegex())
) { ) {
const white = filter.isException() || filter.isBadFilter();
// We don't want tldts to call its own "extractHostname" on ip, bail out ip first. // We don't want tldts to call its own "extractHostname" on ip, bail out ip first.
// Now ip has been bailed out, we can safely set normalizeTldtsOpt.detectIp to false. // Now ip has been bailed out, we can safely set normalizeTldtsOpt.detectIp to false.
if (isProbablyIpv4(filter.hostname) || isProbablyIpv6(filter.hostname)) { if (isProbablyIpv4(filter.hostname) || isProbablyIpv6(filter.hostname)) {
if (white) {
// We do not support whitelist IP anyway.
result[1] = ParseType.Null;
return result;
}
result[0] = filter.hostname; result[0] = filter.hostname;
result[1] = ParseType.BlackIP; result[1] = ParseType.BlackIP;
return result; return result;
} }
const hostname = fastNormalizeDomainWithoutWwwNoIP(filter.hostname); const parsed = tldts.parse(filter.hostname, looseTldtsOpt);
if (!hostname) {
/**
* We can exclude wildcard in TLD
*
* ||example.*
*
* This also excludes non-standard TLDs like `.tor`, `.onion`, `.dn42`, etc.
*/
if (!parsed.publicSuffix || !parsed.isIcann || !parsed.hostname || !parsed.domain) {
result[1] = ParseType.Null; result[1] = ParseType.Null;
return result; return result;
} }
@ -251,12 +265,23 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
// |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp()) // |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp())
const isIncludeAllSubDomain = filter.isHostnameAnchor(); const isIncludeAllSubDomain = filter.isHostnameAnchor();
if (filter.isException() || filter.isBadFilter()) { let hostname = parsed.hostname;
result[0] = hostname; if (white) {
result[0] = filter.hostname;
result[1] = isIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute; result[1] = isIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute;
return result; return result;
} }
// we only strip www when it is blacklist
if (parsed.subdomain) {
if (parsed.subdomain === 'www' || parsed.subdomain === 'xml-v4') {
hostname = parsed.domain;
}
if (parsed.subdomain.startsWith('www.')) {
hostname = parsed.subdomain.slice(4) + '.' + parsed.domain;
}
}
const _1p = filter.firstParty(); const _1p = filter.firstParty();
const _3p = filter.thirdParty(); const _3p = filter.thirdParty();
@ -439,38 +464,65 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa
return result; return result;
} }
if (sliced.charCodeAt(0) === 45 /* - */) { // We don't want tldts to call its own "extractHostname" on ip, bail out ip first.
// line.startsWith('-') is not a valid domain // Now ip has been bailed out, we can safely set normalizeTldtsOpt.detectIp to false.
result[1] = ParseType.ErrorMessage; if (isProbablyIpv4(sliced) || isProbablyIpv6(sliced)) {
result[0] = `[parse-filter E0001] (${white ? 'white' : 'black'}) invalid domain: ${JSON.stringify({ // TODO: we might want to implement rejecting IPs in the future
result[0] = `[parse-filter E0002] (${white ? 'white' : 'black'}) ip: ${JSON.stringify({
line, sliced, sliceStart, sliceEnd line, sliced, sliceStart, sliceEnd
})}`; })}`;
result[1] = ParseType.ErrorMessage;
return result; return result;
} }
const suffix = tldts.getPublicSuffix(sliced, looseTldtsOpt); const parsed = tldts.parse(sliced, looseTldtsOpt);
if (!suffix) {
// This exclude domain-like resource like `_social_tracking.js^` /**
* We can exclude wildcard in TLD
*
* ||example.*
*
* We can also exclude URL path patterns like these, since TLDs and file extensions don't overlap
*
* -ad.css
* -ad.js
*
* This also excludes non-standard TLDs like `.tor`, `.onion`, `.dn42`, etc.
*/
if (!parsed.publicSuffix || !parsed.isIcann || !parsed.hostname || !parsed.domain) {
result[1] = ParseType.Null; result[1] = ParseType.Null;
return result; return result;
} }
const normalizer = white ? fastNormalizeDomain : fastNormalizeDomainWithoutWww; // no wildcard, we can safely normalize it
const domain = normalizer(sliced); if (!parsed.hostname.includes('*')) {
if (domain) {
result[0] = domain;
if (white) { if (white) {
result[0] = parsed.hostname;
result[1] = includeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute; result[1] = includeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute;
} else { return result;
result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
} }
// blacklist, we can strip www from subdomain
if (parsed.subdomain) {
if (parsed.subdomain === 'www' || parsed.subdomain === 'xml-v4') {
result[0] = parsed.domain;
result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
return result;
}
if (parsed.subdomain.startsWith('www.')) {
result[0] = parsed.subdomain.slice(4) + '.' + parsed.domain;
result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
return result;
}
}
result[0] = parsed.hostname;
result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute;
return result; return result;
} }
result[0] = `[parse-filter E0010] (${white ? 'white' : 'black'}) invalid domain: ${JSON.stringify({ result[0] = `[parse-filter E0010] (${white ? 'white' : 'black'}) invalid domain: ${JSON.stringify({
line, domain, suffix, sliced, sliceStart, sliceEnd line, sliced, sliceStart, sliceEnd, parsed
})}`; })}`;
result[1] = ParseType.ErrorMessage; result[1] = ParseType.ErrorMessage;
return result; return result;