From 172e4efd8a8baf78dc9f2007b3178915fa15c0c8 Mon Sep 17 00:00:00 2001 From: SukkaW Date: Fri, 20 Jun 2025 15:26:14 +0800 Subject: [PATCH] Improve parsing (ready for reject ip and reject wildcard) --- Build/constants/loose-tldts-opt.ts | 4 +- Build/lib/parse-filter/filters.ts | 96 +++++++++++++++++++++++------- 2 files changed, 76 insertions(+), 24 deletions(-) diff --git a/Build/constants/loose-tldts-opt.ts b/Build/constants/loose-tldts-opt.ts index e51021c5..d6e870c2 100644 --- a/Build/constants/loose-tldts-opt.ts +++ b/Build/constants/loose-tldts-opt.ts @@ -3,9 +3,9 @@ import type * as tldts from 'tldts'; export const looseTldtsOpt: NonNullable[1]> = { allowPrivateDomains: false, extractHostname: false, + mixedInputs: false, validateHostname: false, - detectIp: false, - mixedInputs: false + detectIp: false }; export const loosTldOptWithPrivateDomains: NonNullable[1]> = { diff --git a/Build/lib/parse-filter/filters.ts b/Build/lib/parse-filter/filters.ts index 8a7402b1..7d7071f3 100644 --- a/Build/lib/parse-filter/filters.ts +++ b/Build/lib/parse-filter/filters.ts @@ -4,9 +4,8 @@ import { fetchAssets } from '../fetch-assets'; import { onBlackFound, onWhiteFound } from './shared'; import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie'; import { looseTldtsOpt } from '../../constants/loose-tldts-opt'; -import tldts from 'tldts-experimental'; +import tldts from 'tldts'; import { NetworkFilter } from '@ghostery/adblocker'; -import { fastNormalizeDomain, fastNormalizeDomainWithoutWww, fastNormalizeDomainWithoutWwwNoIP } from '../normalize-domain'; import { isProbablyIpv4, isProbablyIpv6 } from 'foxts/is-probably-ip'; const enum ParseType { @@ -232,16 +231,31 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa && filter.isPlain() // isPlain() === !isRegex() && (!filter.isFullRegex()) ) { + const white = filter.isException() || filter.isBadFilter(); + // We don't want tldts to call its own "extractHostname" on ip, bail out ip first. // Now ip has been bailed out, we can safely set normalizeTldtsOpt.detectIp to false. if (isProbablyIpv4(filter.hostname) || isProbablyIpv6(filter.hostname)) { + if (white) { + // We do not support whitelist IP anyway. + result[1] = ParseType.Null; + return result; + } result[0] = filter.hostname; result[1] = ParseType.BlackIP; return result; } - const hostname = fastNormalizeDomainWithoutWwwNoIP(filter.hostname); - if (!hostname) { + const parsed = tldts.parse(filter.hostname, looseTldtsOpt); + + /** + * We can exclude wildcard in TLD + * + * ||example.* + * + * This also exclude non standard TLD like `.tor`, `.onion`, `.dn42`, etc. + */ + if (!parsed.publicSuffix || !parsed.isIcann || !parsed.hostname || !parsed.domain) { result[1] = ParseType.Null; return result; } @@ -251,12 +265,23 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa // |https://: !filter.isHostnameAnchor() && (filter.fromHttps() || filter.fromHttp()) const isIncludeAllSubDomain = filter.isHostnameAnchor(); - if (filter.isException() || filter.isBadFilter()) { - result[0] = hostname; + let hostname = parsed.hostname; + if (white) { + result[0] = filter.hostname; result[1] = isIncludeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute; return result; } + // we only strip www when it is blacklist + if (parsed.subdomain) { + if (parsed.subdomain === 'www' || parsed.subdomain === 'xml-v4') { + hostname = parsed.domain; + } + if (parsed.subdomain.startsWith('www.')) { + hostname = parsed.subdomain.slice(4) + '.' + parsed.domain; + } + } + const _1p = filter.firstParty(); const _3p = filter.thirdParty(); @@ -439,38 +464,65 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa return result; } - if (sliced.charCodeAt(0) === 45 /* - */) { - // line.startsWith('-') is not a valid domain - result[1] = ParseType.ErrorMessage; - result[0] = `[parse-filter E0001] (${white ? 'white' : 'black'}) invalid domain: ${JSON.stringify({ + // We don't want tldts to call its own "extractHostname" on ip, bail out ip first. + // Now ip has been bailed out, we can safely set normalizeTldtsOpt.detectIp to false. + if (isProbablyIpv4(sliced) || isProbablyIpv6(sliced)) { + // TODO: we might want to implements reject ip in the future + result[0] = `[parse-filter E0002] (${white ? 'white' : 'black'}) ip: ${JSON.stringify({ line, sliced, sliceStart, sliceEnd })}`; + result[1] = ParseType.ErrorMessage; return result; } - const suffix = tldts.getPublicSuffix(sliced, looseTldtsOpt); - if (!suffix) { - // This exclude domain-like resource like `_social_tracking.js^` + const parsed = tldts.parse(sliced, looseTldtsOpt); + + /** + * We can exclude wildcard in TLD + * + * ||example.* + * + * We can also exclude URL path pattern like this, since TLD and file extension don't overlapped + * + * -ad.css + * -ad.js + * + * This also exclude non standard TLD like `.tor`, `.onion`, `.dn42`, etc. + */ + if (!parsed.publicSuffix || !parsed.isIcann || !parsed.hostname || !parsed.domain) { result[1] = ParseType.Null; return result; } - const normalizer = white ? fastNormalizeDomain : fastNormalizeDomainWithoutWww; - const domain = normalizer(sliced); - - if (domain) { - result[0] = domain; - + // no wildcard, we can safely normalize it˝ + if (!parsed.hostname.includes('*')) { if (white) { + result[0] = parsed.hostname; result[1] = includeAllSubDomain ? ParseType.WhiteIncludeSubdomain : ParseType.WhiteAbsolute; - } else { - result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute; + return result; } + + // blacklist, we can strip www from subdomain + if (parsed.subdomain) { + if (parsed.subdomain === 'www' || parsed.subdomain === 'xml-v4') { + result[0] = parsed.domain; + result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute; + return result; + } + if (parsed.subdomain.startsWith('www.')) { + result[0] = parsed.subdomain.slice(4) + '.' + parsed.domain; + result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute; + return result; + } + } + + result[0] = parsed.hostname; + result[1] = includeAllSubDomain ? ParseType.BlackIncludeSubdomain : ParseType.BlackAbsolute; return result; } result[0] = `[parse-filter E0010] (${white ? 'white' : 'black'}) invalid domain: ${JSON.stringify({ - line, domain, suffix, sliced, sliceStart, sliceEnd + line, sliced, sliceStart, sliceEnd, parsed })}`; result[1] = ParseType.ErrorMessage; return result;