#!/usr/bin/env sh # Description: List non-empty duplicates in the current dir (based on size followed by MD5) # # Source: https://www.commandlinefu.com/commands/view/3555/find-duplicate-files-based-on-size-first-then-md5-hash # # Dependencies: find md5sum sort uniq xargs gsed # # Notes: # 1. If the file size exceeds $size_digits digits the file will be misplaced # 12 digits fit files up to 931GiB # 2. Bash compatible required for mktemp # # Shell: Bash # Authors: syssyphus, KlzXS EDITOR="${EDITOR:-vi}" TMPDIR="${TMPDIR:-/tmp}" size_digits=12 tmpfile=$(mktemp "$TMPDIR/.nnnXXXXXX") printf "\ ## This is an overview of all duplicate files found. ## Comment out the files you wish to remove. You will be given an option to cancel. ## Lines with double comments (##) are ignored. ## You will have the option to remove the files with force or interactively.\n " > "$tmpfile" # shellcheck disable=SC2016 find . -size +0 -type f -printf "%${size_digits}s %p\n" | sort -rn | uniq -w"${size_digits}" -D | sed -e ' s/^ \{0,12\}\([0-9]\{0,12\}\) \(.*\)$/printf "%s %s\\n" "$(md5sum "\2")" "d\1"/ ' | tr '\n' '\0' | xargs -0 -n1 sh -c | sort | { uniq -w32 --all-repeated=separate; echo; } | sed -ne ' h s/^\(.\{32\}\).* d\([0-9]*\)$/## md5sum: \1 size: \2 bytes/p g :loop N /.*\n$/!b loop p' | sed -e 's/^.\{32\} \(.*\) d[0-9]*$/\1/' >> "$tmpfile" "$EDITOR" "$tmpfile" printf "Remove commented files? (yes/no) [default=n]: " read -r commented if [ "$commented" = "y" ]; then sedcmd="/^##.*/d; /^[^#].*/d; /^$/d; s/^# *\(.*\)$/\1/" else printf "Press any key to exit" read -r _ exit fi printf "Remove with force or interactive? (f/i) [default=i]: " read -r force if [ "$force" = "f" ]; then #shellcheck disable=SC2016 sed -e "$sedcmd" "$tmpfile" | tr '\n' '\0' | xargs -0 -r sh -c 'rm -f "$0" "$@"