From ca9415ecc6a77fbee6e2956442bdedb8cc9dc76b Mon Sep 17 00:00:00 2001 From: SukkaW Date: Sun, 19 Jan 2025 12:33:27 +0800 Subject: [PATCH] Phishing domains trim `www` --- Build/lib/get-phishing-domains.ts | 2 +- Build/lib/normalize-domain.ts | 18 ++++++++++++ Build/lib/parse-filter/domainlists.ts | 40 +++++++++++++++------------ 3 files changed, 41 insertions(+), 19 deletions(-) diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts index 99a073a9..2f5c0408 100644 --- a/Build/lib/get-phishing-domains.ts +++ b/Build/lib/get-phishing-domains.ts @@ -208,7 +208,7 @@ const processPhihsingDomains = cache(function processPhihsingDomains(domainArr: }); const downloads = [ - ...PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainListsWithPreload(...entry)), + ...PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainListsWithPreload(...entry, true)), ...PHISHING_HOSTS_EXTRA.map(entry => processHostsWithPreload(...entry)) ]; diff --git a/Build/lib/normalize-domain.ts b/Build/lib/normalize-domain.ts index 8837ae45..2e18f5e5 100644 --- a/Build/lib/normalize-domain.ts +++ b/Build/lib/normalize-domain.ts @@ -24,6 +24,24 @@ export function fastNormalizeDomain(domain: string, parsed: TldTsParsed | null = return parsed.hostname; } +export function fastNormalizeDomainIgnoreWww(domain: string, parsed: TldTsParsed | null = null) { + // We don't want tldts to call its own "extractHostname" on ip, bail out ip first. + // Now ip has been bailed out, we can safely set normalizeTldtsOpt.detectIp to false. + if (isProbablyIpv4(domain) || isProbablyIpv6(domain)) { + return null; + } + + parsed ??= tldts.parse(domain, normalizeTldtsOpt); + + // Private invalid domain (things like .tor, .dn42, etc) + if (!parsed.isIcann && !parsed.isPrivate) return null; + + if (parsed.subdomain === 'www') { + return parsed.domain; + } + return parsed.hostname; +} + export function normalizeDomain(domain: string, parsed: TldTsParsed | null = null) { if (domain.length === 0) return null; diff --git a/Build/lib/parse-filter/domainlists.ts b/Build/lib/parse-filter/domainlists.ts index 428c73a0..531084aa 100644 --- a/Build/lib/parse-filter/domainlists.ts +++ b/Build/lib/parse-filter/domainlists.ts @@ -1,36 +1,35 @@ -import picocolors from 'picocolors'; -import { fastNormalizeDomain } from '../normalize-domain'; +import { fastNormalizeDomain, fastNormalizeDomainIgnoreWww } from '../normalize-domain'; import { processLine } from '../process-line'; import { onBlackFound } from './shared'; import { fetchAssets } from '../fetch-assets'; import type { Span } from '../../trace'; -function domainListLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string) { +function domainListLineCb(l: string, set: string[], includeAllSubDomain: boolean, meta: string, normalizeDomain = fastNormalizeDomain) { const line = processLine(l); if (!line) return; - const domain = fastNormalizeDomain(line); - if (!domain) return; - if (domain !== line) { - console.log( - picocolors.red('[process domain list]'), - picocolors.gray(`line: ${line}`), - picocolors.gray(`domain: ${domain}`), - picocolors.gray(meta) - ); - + const domain = normalizeDomain(line); + if (!domain) { + // console.log( + // picocolors.red('[process domain list]'), + // picocolors.gray(`line: ${line}`), + // picocolors.gray(`domain: ${domain}`), + // picocolors.gray(meta) + // ); return; } onBlackFound(domain, meta); - set.push(includeAllSubDomain ? `.${line}` : line); + set.push(includeAllSubDomain ? `.${domain}` : domain); } export function processDomainLists( span: Span, - domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false + domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false, wwwToApex = false ) { + const domainNormalizer = wwwToApex ? fastNormalizeDomainIgnoreWww : fastNormalizeDomain; + return span.traceChildAsync(`process domainlist: ${domainListsUrl}`, async (span) => { const text = await span.traceChildAsync('download', () => fetchAssets( domainListsUrl, @@ -41,7 +40,7 @@ export function processDomainLists( span.traceChildSync('parse domain list', () => { for (let i = 0, len = filterRules.length; i < len; i++) { - domainListLineCb(filterRules[i], domainSets, includeAllSubDomain, domainListsUrl); + domainListLineCb(filterRules[i], domainSets, includeAllSubDomain, domainListsUrl, domainNormalizer); } }); @@ -49,7 +48,12 @@ export function processDomainLists( }); } -export function processDomainListsWithPreload(domainListsUrl: string, mirrors: string[] | null, includeAllSubDomain = false) { +export function processDomainListsWithPreload( + domainListsUrl: string, mirrors: string[] | null, + includeAllSubDomain = false, wwwToApex = false +) { + const domainNormalizer = wwwToApex ? fastNormalizeDomainIgnoreWww : fastNormalizeDomain; + const downloadPromise = fetchAssets(domainListsUrl, mirrors); return (span: Span) => span.traceChildAsync(`process domainlist: ${domainListsUrl}`, async (span) => { @@ -59,7 +63,7 @@ export function processDomainListsWithPreload(domainListsUrl: string, mirrors: s span.traceChildSync('parse domain list', () => { for (let i = 0, len = filterRules.length; i < len; i++) { - domainListLineCb(filterRules[i], domainSets, includeAllSubDomain, domainListsUrl); + domainListLineCb(filterRules[i], domainSets, includeAllSubDomain, domainListsUrl, domainNormalizer); } });