diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts index 2f5c0408..99a073a9 100644 --- a/Build/lib/get-phishing-domains.ts +++ b/Build/lib/get-phishing-domains.ts @@ -208,7 +208,7 @@ const processPhihsingDomains = cache(function processPhihsingDomains(domainArr: }); const downloads = [ - ...PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainListsWithPreload(...entry, true)), + ...PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainListsWithPreload(...entry)), ...PHISHING_HOSTS_EXTRA.map(entry => processHostsWithPreload(...entry)) ]; diff --git a/Build/lib/normalize-domain.ts b/Build/lib/normalize-domain.ts index 2e18f5e5..32c7c99c 100644 --- a/Build/lib/normalize-domain.ts +++ b/Build/lib/normalize-domain.ts @@ -7,6 +7,32 @@ import { isProbablyIpv4, isProbablyIpv6 } from 'foxts/is-probably-ip'; type TldTsParsed = ReturnType<typeof tldts.parse>; +/** + * Skipped the input non-empty check, the `domain` should not be empty. + */ +export function fastNormalizeDomainWithoutWww(domain: string, parsed: TldTsParsed | null = null) { + // We don't want tldts to call its own "extractHostname" on ip, bail out ip first. + // Now ip has been bailed out, we can safely set normalizeTldtsOpt.detectIp to false. + if (isProbablyIpv4(domain) || isProbablyIpv6(domain)) { + return null; + } + + parsed ??= tldts.parse(domain, normalizeTldtsOpt); + // Private invalid domain (things like .tor, .dn42, etc) + if (!parsed.isIcann && !parsed.isPrivate) return null; + + if (parsed.subdomain) { + if (parsed.subdomain === 'www') { + return parsed.domain; + } + if (parsed.subdomain.startsWith('www.')) { + return parsed.subdomain.slice(4) + '.' + parsed.domain; + } + } + + return parsed.hostname; +} + /** * Skipped the input non-empty check, the `domain` should not be empty. 
*/ @@ -24,24 +50,6 @@ export function fastNormalizeDomain(domain: string, parsed: TldTsParsed | null = return parsed.hostname; } -export function fastNormalizeDomainIgnoreWww(domain: string, parsed: TldTsParsed | null = null) { - // We don't want tldts to call its own "extractHostname" on ip, bail out ip first. - // Now ip has been bailed out, we can safely set normalizeTldtsOpt.detectIp to false. - if (isProbablyIpv4(domain) || isProbablyIpv6(domain)) { - return null; - } - - parsed ??= tldts.parse(domain, normalizeTldtsOpt); - - // Private invalid domain (things like .tor, .dn42, etc) - if (!parsed.isIcann && !parsed.isPrivate) return null; - - if (parsed.subdomain === 'www') { - return parsed.domain; - } - return parsed.hostname; -} - export function normalizeDomain(domain: string, parsed: TldTsParsed | null = null) { if (domain.length === 0) return null; diff --git a/Build/lib/parse-filter/domainlists.ts b/Build/lib/parse-filter/domainlists.ts index 834c6c1b..d3007df2 100644 --- a/Build/lib/parse-filter/domainlists.ts +++ b/Build/lib/parse-filter/domainlists.ts @@ -1,4 +1,4 @@ -import { fastNormalizeDomain, fastNormalizeDomainIgnoreWww } from '../normalize-domain'; +import { fastNormalizeDomain, fastNormalizeDomainWithoutWww } from '../normalize-domain'; import { processLine } from '../process-line'; import { onBlackFound } from './shared'; import { fetchAssets } from '../fetch-assets'; @@ -27,9 +27,8 @@ function domainListLineCbIncludeAllSubdomain(line: string, set: string[], meta: export function processDomainLists( span: Span, - domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, wwwToApex = false + domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false ) { - const domainNormalizer = wwwToApex ? fastNormalizeDomainIgnoreWww : fastNormalizeDomain; const lineCb = includeAllSubDomain ? 
domainListLineCbIncludeAllSubdomain : domainListLineCb; return span.traceChildAsync(`process domainlist: ${domainListsUrl}`, async (span) => { @@ -42,7 +41,7 @@ export function processDomainLists( span.traceChildSync('parse domain list', () => { for (let i = 0, len = filterRules.length; i < len; i++) { - lineCb(filterRules[i], domainSets, domainListsUrl, domainNormalizer); + lineCb(filterRules[i], domainSets, domainListsUrl, fastNormalizeDomainWithoutWww); } }); @@ -52,10 +51,8 @@ export function processDomainLists( export function processDomainListsWithPreload( domainListsUrl: string, mirrors: string[] | null, - includeAllSubDomain = false, wwwToApex = false + includeAllSubDomain = false ) { - const domainNormalizer = wwwToApex ? fastNormalizeDomainIgnoreWww : fastNormalizeDomain; - const downloadPromise = fetchAssets(domainListsUrl, mirrors, true); const lineCb = includeAllSubDomain ? domainListLineCbIncludeAllSubdomain : domainListLineCb; @@ -65,7 +62,7 @@ export function processDomainListsWithPreload( span.traceChildSync('parse domain list', () => { for (let i = 0, len = filterRules.length; i < len; i++) { - lineCb(filterRules[i], domainSets, domainListsUrl, domainNormalizer); + lineCb(filterRules[i], domainSets, domainListsUrl, fastNormalizeDomainWithoutWww); } }); diff --git a/Build/lib/parse-filter/filters.ts b/Build/lib/parse-filter/filters.ts index 892cd217..cfbd4f18 100644 --- a/Build/lib/parse-filter/filters.ts +++ b/Build/lib/parse-filter/filters.ts @@ -3,10 +3,10 @@ import type { Span } from '../../trace'; import { fetchAssets } from '../fetch-assets'; import { onBlackFound, onWhiteFound } from './shared'; import { createRetrieKeywordFilter as createKeywordFilter } from 'foxts/retrie'; -import { fastNormalizeDomain } from '../normalize-domain'; import { looseTldtsOpt } from '../../constants/loose-tldts-opt'; import tldts from 'tldts-experimental'; import { NetworkFilter } from '@ghostery/adblocker'; +import { fastNormalizeDomainWithoutWww } from 
'../normalize-domain'; const enum ParseType { WhiteIncludeSubdomain = 0, @@ -221,7 +221,7 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa && filter.isPlain() // isPlain() === !isRegex() && (!filter.isFullRegex()) ) { - const hostname = fastNormalizeDomain(filter.hostname); + const hostname = fastNormalizeDomainWithoutWww(filter.hostname); if (!hostname) { result[1] = ParseType.Null; return result; @@ -436,7 +436,7 @@ export function parse($line: string, result: [string, ParseType], includeThirdPa return result; } - const domain = fastNormalizeDomain(sliced); + const domain = fastNormalizeDomainWithoutWww(sliced); if (domain && domain === sliced) { result[0] = domain; diff --git a/Build/lib/parse-filter/hosts.ts b/Build/lib/parse-filter/hosts.ts index 865c8f87..d551db7a 100644 --- a/Build/lib/parse-filter/hosts.ts +++ b/Build/lib/parse-filter/hosts.ts @@ -1,6 +1,6 @@ import type { Span } from '../../trace'; import { fetchAssets } from '../fetch-assets'; -import { fastNormalizeDomain } from '../normalize-domain'; +import { fastNormalizeDomainWithoutWww } from '../normalize-domain'; import { onBlackFound } from './shared'; function hostsLineCb(line: string, set: string[], includeAllSubDomain: boolean, meta: string) { @@ -8,7 +8,7 @@ function hostsLineCb(line: string, set: string[], includeAllSubDomain: boolean, if (!_domain) { return; } - const domain = fastNormalizeDomain(_domain); + const domain = fastNormalizeDomainWithoutWww(_domain); if (!domain) { return; }