From 99053ecd32630e85eab61714f16a7dd404a404c9 Mon Sep 17 00:00:00 2001
From: xeruf <27jf@pm.me>
Date: Tue, 31 Jan 2023 18:13:32 +0100
Subject: [PATCH] bin: extend scripts for duplicate detection

Add dupcheck, which prints those of its arguments that duplicate the
first one (size comparison first, then diff).

Make duploc call dupcheck instead of comparing md5 checksums inline,
and fall back to the plain file path when synct-unarchive fails.
---
 .local/bin/scripts/dupcheck | 21 +++++++++++++++++++++
 .local/bin/scripts/duploc   | 14 ++++++++++----
 2 files changed, 31 insertions(+), 4 deletions(-)
 create mode 100755 .local/bin/scripts/dupcheck

diff --git a/.local/bin/scripts/dupcheck b/.local/bin/scripts/dupcheck
new file mode 100755
index 0000000..64bd8fd
--- /dev/null
+++ b/.local/bin/scripts/dupcheck
@@ -0,0 +1,21 @@
+#!/bin/sh
+# Check the first file against all other ones given and print duplicates.
+# Usage: dupcheck TARGET FILE...
+# Prints every FILE that is a regular file other than TARGET with identical content.
+# Checks first size then diff.
+# Exits with 2 if fewer than two files are given.
+# TODO diff initial bytes
+if test $# -lt 2
+then echo "Need at least 2 files to compare!" >&2
+  exit 2
+fi
+target="$1"
+shift
+for existing
+do test -f "$existing" &&
+  test "$(realpath "$target")" != "$(realpath "$existing")" &&
+  test "$(stat --format %s "$target")" -eq "$(stat --format %s "$existing")" &&
+  diff -q "$target" "$existing" >/dev/null &&
+  echo "$existing"
+  #test "$(md5sum "$existing" | cut -d\  -f1)" = "$(md5sum "$filepath" | cut -d\  -f1)" &&
+done
diff --git a/.local/bin/scripts/duploc b/.local/bin/scripts/duploc
index e01c9f8..0014d3b 100755
--- a/.local/bin/scripts/duploc
+++ b/.local/bin/scripts/duploc
@@ -1,11 +1,17 @@
 #!/bin/sh
-# Delete files under current or given path which exist elsewhere as listed in the locate database
-# Matches first by name, then by checksum (currently inefficiently via md5)
+# Delete files under current or given path
+# which exist elsewhere as listed by locate.
+# Args: [threshold (MB, default 50)] [path...]
+# Depends: dupcheck, locate, highlight
+# OptDepends: synct (for checking against original filename)
+# NOTE(review): dupcheck is given the synct-unarchive result, not $filepath - confirm intended
 case $1 in ([0-9]|[0-9][0-9]) threshold=$1; shift;; esac
 find "$@" -size +${threshold:-50}M -type f -exec sh -c "IFS=$'\n'"'
   filepath="{}"
-  target="$(synct-unarchive "$filepath")"
+  target="$(synct-unarchive "$filepath" || echo "$filepath")"
   highlight "$filepath"
   for existing in $(locate -b "$target")
-  do test "$(realpath "$target")" != "$(realpath "$existing")" -a -f "$existing" && test "$(md5sum "$existing" | cut -d\  -f1)" = "$(md5sum "$filepath" | cut -d\  -f1)" && echo "Found duplicate at $existing" && rm -vi "$filepath" && break
+  do test -n "$(dupcheck "$target" "$existing")" &&
+    echo "Duplicate found at $existing" &&
+    rm -vi "$filepath" && break
   done' \;