From 72670e62432e91d2b505edb433fe03fcc3e77f37 Mon Sep 17 00:00:00 2001 From: SukkaW Date: Mon, 21 Oct 2024 17:53:49 +0800 Subject: [PATCH] Update Phishing Hosts building --- Build/build-reject-domainset.ts | 5 ++--- Build/constants/reject-data-source.ts | 15 ++++++--------- Build/lib/get-phishing-domains.ts | 24 +++++++++++++----------- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/Build/build-reject-domainset.ts b/Build/build-reject-domainset.ts index b71e4c3b..3acc807e 100644 --- a/Build/build-reject-domainset.ts +++ b/Build/build-reject-domainset.ts @@ -4,7 +4,7 @@ import process from 'node:process'; import { processHosts, processFilterRules, processDomainLists } from './lib/parse-filter'; -import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA, ADGUARD_FILTERS_WHITELIST } from './constants/reject-data-source'; +import { HOSTS, ADGUARD_FILTERS, PREDEFINED_WHITELIST, DOMAIN_LISTS, HOSTS_EXTRA, DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_EXTRA, PHISHING_DOMAIN_LISTS_EXTRA, ADGUARD_FILTERS_WHITELIST } from './constants/reject-data-source'; import { compareAndWriteFile } from './lib/create-file'; import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line'; import { task } from './trace'; @@ -47,8 +47,7 @@ export const buildRejectDomainSet = task(require.main === module, __filename)(as ...HOSTS_EXTRA.map(host => ` - ${host[0]}`), ...DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`), ...ADGUARD_FILTERS_EXTRA.map(filter => ` - ${Array.isArray(filter) ? filter[0] : filter}`), - ...PHISHING_DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`), - ...PHISHING_HOSTS_EXTRA.map(host => ` - ${host[0]}`) + ...PHISHING_DOMAIN_LISTS_EXTRA.map(domainList => ` - ${domainList[0]}`) ]); const appendArrayToRejectOutput = rejectOutput.addFromDomainset.bind(rejectOutput); diff --git a/Build/constants/reject-data-source.ts b/Build/constants/reject-data-source.ts index c406d473..748cfd8a 100644 --- a/Build/constants/reject-data-source.ts +++ b/Build/constants/reject-data-source.ts @@ -22,7 +22,12 @@ export const HOSTS_EXTRA: HostsSource[] = [ // Dan Pollock's hosts file, 0.0.0.0 version is 30 KiB smaller ['https://someonewhocares.org/hosts/zero/hosts', null, true, TTL.THREE_HOURS()], // ad-wars is not actively maintained, so we set a 7 days cache ttl - ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.TWO_WEEKS()] + ['https://raw.githubusercontent.com/jdlingyu/ad-wars/master/hosts', null, false, TTL.TWO_WEEKS()], + [ + 'https://raw.githubusercontent.com/durablenapkin/scamblocklist/master/hosts.txt', + [], + true, TTL.TWLVE_HOURS() + ] ]; export const DOMAIN_LISTS: HostsSource[] = [ @@ -97,14 +102,6 @@ export const PHISHING_DOMAIN_LISTS_EXTRA: HostsSource[] = [ ] ]; -export const PHISHING_HOSTS_EXTRA: HostsSource[] = [ - [ - 'https://raw.githubusercontent.com/durablenapkin/scamblocklist/master/hosts.txt', - [], - true, TTL.TWLVE_HOURS() - ] -]; - type AdGuardFilterSource = [main: string, mirrors: string[] | null, ttl: number, allowThirdParty?: boolean]; export const ADGUARD_FILTERS: AdGuardFilterSource[] = [ diff --git a/Build/lib/get-phishing-domains.ts b/Build/lib/get-phishing-domains.ts index 6e7ac720..75e203bd 100644 --- a/Build/lib/get-phishing-domains.ts +++ b/Build/lib/get-phishing-domains.ts @@ -1,10 +1,10 @@ -import { processDomainLists, processHosts } from './parse-filter'; +import { processDomainLists } from './parse-filter'; import * as tldts from 'tldts-experimental'; import { dummySpan } from '../trace'; import type { Span } from '../trace'; import { appendArrayInPlaceCurried } from './append-array-in-place'; -import { PHISHING_DOMAIN_LISTS_EXTRA, PHISHING_HOSTS_EXTRA } from '../constants/reject-data-source'; +import { PHISHING_DOMAIN_LISTS_EXTRA } from '../constants/reject-data-source'; import { loosTldOptWithPrivateDomains } from '../constants/loose-tldts-opt'; import picocolors from 'picocolors'; import createKeywordFilter from './aho-corasick'; @@ -153,7 +153,8 @@ const processPhihsingDomains = cache(function processPhihsingDomains(domainArr: for (const apexDomain in domainCountMap) { if ( // !WHITELIST_MAIN_DOMAINS.has(apexDomain) - domainScoreMap[apexDomain] >= 16 + (domainScoreMap[apexDomain] >= 24) + || (domainScoreMap[apexDomain] >= 16 && domainCountMap[apexDomain] >= 4) || (domainScoreMap[apexDomain] >= 13 && domainCountMap[apexDomain] >= 7) || (domainScoreMap[apexDomain] >= 5 && domainCountMap[apexDomain] >= 10) || (domainScoreMap[apexDomain] >= 3 && domainCountMap[apexDomain] >= 16) @@ -162,6 +163,11 @@ const processPhihsingDomains = cache(function processPhihsingDomains(domainArr: } } + // console.log( + // domainScoreMap['wordpress.com'], + // domainCountMap['wordpress.com'] + // ); + return Promise.resolve(domainArr); }, { serializer: serializeArray, @@ -175,8 +181,6 @@ export function getPhishingDomains(parentSpan: Span) { (await Promise.all(PHISHING_DOMAIN_LISTS_EXTRA.map(entry => processDomainLists(curSpan, ...entry, cacheKey)))) .forEach(appendArrayInPlaceCurried(domainArr)); - (await Promise.all(PHISHING_HOSTS_EXTRA.map(entry => processHosts(curSpan, ...entry, cacheKey)))) - .forEach(appendArrayInPlaceCurried(domainArr)); return domainArr; }); @@ -205,9 +209,9 @@ export function calcDomainAbuseScore(subdomain: string, fullDomain: string = sub const subdomainLength = subdomain.length; - if (subdomainLength > 4) { - weight += 0.5; - if (subdomainLength > 10) { + if (subdomainLength > 6) { + weight += 0.25; + if (subdomainLength > 11) { weight += 0.6; if (subdomainLength > 20) { weight += 1; @@ -220,9 +224,7 @@ export function calcDomainAbuseScore(subdomain: string, fullDomain: string = sub } } - if (subdomain.startsWith('www.')) { - weight += 1; - } else if (subdomain.slice(1).includes('.')) { + if (subdomain.slice(1).includes('.')) { weight += 1; if (subdomain.includes('www.')) { weight += 1;