Feat: update phishing domain threshold

This commit is contained in:
SukkaW 2024-05-19 03:58:00 +08:00
parent 6380d0be5a
commit b5a6e05a84
4 changed files with 95 additions and 63 deletions

View File

@ -11,7 +11,6 @@ import createKeywordFilter from './lib/aho-corasick';
import { readFileByLine, readFileIntoProcessedArray } from './lib/fetch-text-by-line';
import { sortDomains } from './lib/stable-sort-domain';
import { task } from './trace';
import { getGorhillPublicSuffixPromise } from './lib/get-gorhill-publicsuffix';
import * as tldts from 'tldts';
import { SHARED_DESCRIPTION } from './lib/constants';
import { getPhishingDomains } from './lib/get-phishing-domains';

View File

@ -0,0 +1,10 @@
// eslint-disable-next-line import-x/no-unresolved -- bun
import { describe, expect, it } from 'bun:test';
import { calcDomainAbuseScore } from './get-phishing-domains';
// Smoke test for the phishing abuse scorer, using a long hash-like subdomain entry.
describe('calcDomainAbuseScore', () => {
  it('nmdj.pl', () => {
    const score = calcDomainAbuseScore('.01462ccca801fed55370d79231c876e5.nmdj.pl');
    // Keep printing the raw score: it is handy when tuning the weights by hand.
    console.log(score);
    // The entry is 41 characters long, so the scorer is guaranteed to return at least
    // its base weight (1) plus the "longer than 34 characters" bonus (2.5). Any
    // subdomain-based bonus only adds to that, so this is a safe lower bound.
    expect(score).toBeGreaterThanOrEqual(3.5);
  });
});

View File

@ -103,21 +103,23 @@ const BLACK_TLD = new Set([
]);
export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('get phishing domains').traceAsyncFn(async (span) => {
const [domainSet, domainSet2, gorhill] = await Promise.all([
processDomainLists(span, 'https://curbengh.github.io/phishing-filter/phishing-filter-domains.txt', true, TTL.THREE_HOURS()),
isCI
? processDomainLists(span, 'https://phishing.army/download/phishing_army_blocklist.txt', true, TTL.THREE_HOURS())
: null,
getGorhillPublicSuffixPromise()
]);
if (domainSet2) {
const gorhill = await getGorhillPublicSuffixPromise();
const domainSet = await span.traceChildAsync('download/parse/merge phishing domains', async (curSpan) => {
const [domainSet, domainSet2] = await Promise.all([
processDomainLists(curSpan, 'https://curbengh.github.io/phishing-filter/phishing-filter-domains.txt', true, TTL.THREE_HOURS()),
processDomainLists(curSpan, 'https://phishing.army/download/phishing_army_blocklist.txt', true, TTL.THREE_HOURS())
]);
SetAdd(domainSet, domainSet2);
}
span.traceChildSync('whitelisting phishing domains', (parentSpan) => {
const trieForRemovingWhiteListed = parentSpan.traceChildSync('create trie for whitelisting', () => createTrie(domainSet));
return domainSet;
});
return parentSpan.traceChild('delete whitelisted from domainset').traceSyncFn(() => {
span.traceChildSync('whitelisting phishing domains', (curSpan) => {
const trieForRemovingWhiteListed = curSpan.traceChildSync('create trie for whitelisting', () => createTrie(domainSet));
return curSpan.traceChild('delete whitelisted from domainset').traceSyncFn(() => {
for (let i = 0, len = WHITELIST_DOMAIN.length; i < len; i++) {
const white = WHITELIST_DOMAIN[i];
domainSet.delete(white);
@ -134,68 +136,28 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
const domainArr = Array.from(domainSet);
for (let i = 0, len = domainArr.length; i < len; i++) {
const line = processLine(domainArr[i]);
if (!line) continue;
const line = domainArr[i];
const apexDomain = gorhill.getDomain(line);
if (!apexDomain) continue;
const safeGorhillLine = line[0] === '.' ? line.slice(1) : line;
domainCountMap[apexDomain] ||= 0;
const isPhishingDomainMockingCoJp = line.includes('-co-jp');
if (isPhishingDomainMockingCoJp) {
domainCountMap[apexDomain] += 0.5;
const apexDomain = gorhill.getDomain(safeGorhillLine);
if (!apexDomain) {
console.log({ line });
continue;
}
if (line.startsWith('.amaz')) {
domainCountMap[apexDomain] += 0.5;
if (line.startsWith('.amazon-')) {
domainCountMap[apexDomain] += 4.5;
}
if (isPhishingDomainMockingCoJp) {
domainCountMap[apexDomain] += 4;
}
} else if (line.startsWith('.customer')) {
domainCountMap[apexDomain] += 0.25;
}
const tld = gorhill.getPublicSuffix(line[0] === '.' ? line.slice(1) : line);
const tld = gorhill.getPublicSuffix(safeGorhillLine);
if (!tld || !BLACK_TLD.has(tld)) continue;
// Only when tld is black will this 1 weight be added
domainCountMap[apexDomain] += 1;
const lineLen = line.length;
if (lineLen > 19) {
// Add more weight if the domain is long enough
if (lineLen > 44) {
domainCountMap[apexDomain] += 3.5;
} else if (lineLen > 34) {
domainCountMap[apexDomain] += 2.5;
} else if (lineLen > 29) {
domainCountMap[apexDomain] += 1.5;
} else if (lineLen > 24) {
domainCountMap[apexDomain] += 0.75;
} else {
domainCountMap[apexDomain] += 0.25;
}
if (domainCountMap[apexDomain] < 5) {
const subdomain = tldts.getSubdomain(line, { detectIp: false });
if (subdomain?.includes('.')) {
domainCountMap[apexDomain] += 1.5;
}
}
}
domainCountMap[apexDomain] ||= 0;
domainCountMap[apexDomain] += calcDomainAbuseScore(line);
}
});
const results = span.traceChildSync('get final phishing results', () => {
const res: string[] = [];
for (const domain in domainCountMap) {
if (domainCountMap[domain] >= 5) {
if (domainCountMap[domain] >= 8) {
res.push(`.${domain}`);
}
}
@ -204,3 +166,61 @@ export const getPhishingDomains = (parentSpan: Span) => parentSpan.traceChild('g
return [results, domainSet] as const;
});
/**
 * Computes a heuristic "abuse" score for a single domain entry.
 *
 * The score starts at 1 and only ever grows. It is increased by:
 * - brand-mocking patterns in the raw text (`-co-jp`, a `.amaz` / `.amazon-` prefix,
 *   or `.customer` anywhere in the entry when the `.amaz` prefix is absent);
 * - the total length of the entry (tiers above 19, 24, 29, 34 and 44 characters);
 * - the subdomain reported by `tldts.getSubdomain` (nested labels and length tiers).
 *
 * @param line The domain entry to score. The prefix checks only match entries that
 *   start with a dot (e.g. `.amazon-foo.example`).
 * @returns The accumulated score (always >= 1).
 */
export function calcDomainAbuseScore(line: string) {
  let score = 1;

  const mocksCoJp = line.includes('-co-jp');
  if (mocksCoJp) {
    score += 0.5;
  }

  const hasAmazPrefix = line.startsWith('.amaz');
  if (hasAmazPrefix) {
    score += 0.5;
    if (line.startsWith('.amazon-')) {
      score += 4.5;
    }
    // Mocking both Amazon and a "co.jp" look-alike is weighted heavily.
    if (mocksCoJp) {
      score += 4;
    }
  } else if (line.includes('.customer')) {
    score += 0.25;
  }

  // Longer entries earn a bigger bonus; entries of 19 characters or fewer earn none.
  score += abuseScoreForLineLength(line.length);

  const sub = tldts.getSubdomain(line, { detectIp: false });
  if (!sub) {
    // No (or empty) subdomain: nothing more to add.
    return score;
  }

  // Look for a dot past the first character only.
  // NOTE(review): presumably index 0 is skipped because the subdomain may keep the
  // entry's leading dot — confirm against tldts behavior.
  if (sub.includes('.', 1)) {
    score += 1;
  }

  score += abuseScoreForSubdomainLength(sub.length);

  return score;
}

/** Bonus awarded for the total length of the entry; 0 when it has 19 characters or fewer. */
function abuseScoreForLineLength(length: number): number {
  if (length > 44) return 3.5;
  if (length > 34) return 2.5;
  if (length > 29) return 1.5;
  if (length > 24) return 0.75;
  if (length > 19) return 0.25;
  return 0;
}

/** Bonus awarded for the length of the subdomain part; 0 when it has 10 characters or fewer. */
function abuseScoreForSubdomainLength(length: number): number {
  if (length > 40) return 3;
  if (length > 30) return 1.5;
  if (length > 20) return 1;
  if (length > 10) return 0.1;
  return 0;
}

View File

@ -302,6 +302,7 @@ inst.360safe.com
.pages.net.br
.myenotice.com
.eu5.net
.jdie.pl
# --- AD Block ---
@ -733,6 +734,8 @@ comments.gazo.space
.footprintdns.com
.measure.office.com
.opinionjet.com
# >> Tracking
.mktg.tags.f5.com
.trk.caseads.com