fix: handle URLs of top domains without a path

- ref #62, #43, #44
- 745c81b134, c623542b9a, 8923941376
were not effective previously
This commit is contained in:
MDLeom 2023-05-19 10:34:04 +00:00
parent a82bec41ed
commit 0578e6c16a
No known key found for this signature in database
GPG Key ID: 32D3E28E96A695E8
1 changed files with 9 additions and 3 deletions

View File

@ -213,9 +213,10 @@ while read URL; do
PATHNAME=$(echo "$URL" | sed "s/^$DOMAIN//")
if [ -z "$PATHNAME" ] || [ "$PATHNAME" = "/" ]; then
## Separate host-only URL
# Separate domain-only/no-path URL (e.g. "example.com/")
echo "$DOMAIN" | \
cut -f 1 -d ":" >> "phishing-notop-domains-temp.txt"
# Remove port
cut -f 1 -d ":" >> "phishing-subdomains.txt"
elif test "${URL#*safelinks.protection.outlook.com}" != "$URL"; then
## Parse hostname from O365 safelink
echo $(node "../src/safelinks.js" "$URL") >> "phishing-notop-domains-temp.txt"
@ -230,7 +231,12 @@ done < "phishing-url-top-domains-temp.txt"
## Re-enable command print
set -x
## "phishing-url-top-domains-temp.txt" may add duplicate entries
## "phishing-subdomains.txt" is derived from URLs of top domains that do not have a path
# Exclude entries that exactly match a line of "phishing-top-domains.txt"
# (grep -F: fixed strings, -x: whole-line match, -v: invert match, -f: read patterns from file)
# and append the remainder to the non-top domains list
cat "phishing-subdomains.txt" | \
grep -Fx -vf "phishing-top-domains.txt" >> "phishing-notop-domains-temp.txt"
## "phishing-subdomains.txt" & "phishing-url-top-domains-temp.txt" may add duplicate entries,
## so sort and de-duplicate the temporary list into the final file
sort -u "phishing-notop-domains-temp.txt" > "phishing-notop-domains.txt"