#!/bin/sh # duplicates - find duplicate files # # http://www.tecneeq.de/files/shell/duplicates/ # # TODO: # - accept directorys from commandline instead of just using ./ # - accept a list of files to compare on stdin # - if we have only two files of the same size we should use cmp, # wich is faster than md5sum because it stops comparing at the # first difference, while md5sum reads the whole file. Problem # is when we have more than two files with the same size, cmp # would have to read some files more than once, wich gives the # advantage back to md5sum. # - write manpage # - make more or less portable (BSD,OSX,Solaris,Linux) # - create pkg and submit to pkgsrc-team # Copyright (c) 2008 Karsten Kruse # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # 3. Neither the name of the author nor the names of its contributors # may be used to endorse or promote products derived from this # software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. set -e set -u #set -x # init some vars we use later my_name=$(basename $0) my_version="03" number_to_consider="0" my_tempdir="${TMPDIR:=/tmp}/${my_name}_$$" foundfiles=${my_tempdir}/allfiles.txt considerfiles=${my_tempdir}/consider.txt md5sums=${my_tempdir}/md5sums.txt originals=${my_tempdir}/unique.txt deletable=${my_tempdir}/delete.txt GIVEN_DIR=unset BE_QUIET=unset DELETE_DUPES=unset HARDLINK_DUPES=unset MOVE_DUPES=unset NO_FILEOPS=unset nofileops="" outputcmd=echo verboseopt="-v" func_usage() { my_name=$(basename $0) cat < print this help -d => search in instead of current working directory -r => find duplicates and remove them -l => find duplicates and hardlink them -m => find duplicates and move them to -q => be quit -x => no file operations, used to see what -d, -l or -m would do -v => print version Examples: $my_name => list duplicates in current directory $my_name -d /mnt => list duplicates in /mnt $my_name -m /tmp => move duplicates to /tmp $my_name -m /tmp -l => move duplicates to /tmp and create hardlinks $my_name -l => hardlink duplicates $my_name -r => just remove duplicates $my_name -m /tmp -x => debug the -m switch END } func_testdir() { if [ ! -d "$2" ] ; then echo >&2 "The directory you specified with $1 does not exist." exit 1 fi } # parse commandline while getopts hd:rlm:qxv option ; do case "$option" in h) func_usage ; exit ;; d) GIVEN_DIR=set ; SEARCHDIR="$OPTARG" ; func_testdir -d "$SEARCHDIR" ;; r) DELETE_DUPES=set ;; l) HARDLINK_DUPES=set ;; m) MOVE_DUPES=set MOVEDIR="$OPTARG" ; func_testdir -m "$MOVEDIR";; q) BE_QUIET=set ;; x) NO_FILEOPS=set ;; v) echo $my_name $my_version ; exit ;; \?) func_usage >&2 ; exit 1 ;; esac done shift $(expr $OPTIND - 1) # make sure tempfiles are removed at program termination or # after we received a signal: trap 'rm -rf "$my_tempdir" >/dev/null 2>&1' 0 trap "exit 2" 1 2 3 6 13 15 mkdir $my_tempdir # we debug a command by putting echo in front of it if [ $NO_FILEOPS = set ] ; then # add verbosity by unsetting BE_QUIET BE_QUIET=unset nofileops=echo fi # if quietness is requested we override "echo" commands with ":" and "-v" # arguments with "" if [ $BE_QUIET = set ] ; then outputcmd=":" verboseopt="" fi $outputcmd -n "Searching files (this filesystem only) ... " if [ ! $GIVEN_DIR = set ] ; then SEARCHDIR="." fi { find $SEARCHDIR -links +1 -type f ! -type d -printf "%12i%12s\t%p\n" \ | sort -k 1.1,1.12 -n | uniq -w 12 | cut -c 13- find $SEARCHDIR -mount -type f -a -links 1 -printf "%12s\t%p\n" } > $foundfiles $outputcmd "$(wc -l < $foundfiles) files to consider." $outputcmd -n "Sorting files by size, removing uniques ... " sort -k 1.1,1.12 -n < $foundfiles | uniq -D -w 12 > $considerfiles number_to_consider=$(wc -l < $considerfiles) if [ $number_to_consider -lt 2 ] ; then $outputcmd "all files unique. Exit." exit else $outputcmd "$number_to_consider files left." fi # cut out filename, replace newline, calculate md5, replace spaces with tab # and feed to sort $outputcmd -n "Calculating md5sums (this takes time) ... " cut -f 2 $considerfiles \ | tr "\n" "\000" \ | xargs -0 md5sum \ | sed 's/ /\t/' \ | sort > $md5sums $outputcmd "$(wc -l < $md5sums) md5sums calculated." $outputcmd -n "Generating list of files to delete ... " # only one of the dupes can be the original uniq -w 32 -d $md5sums > $originals uniq -w 32 -D $md5sums \ | comm -3 - $originals > $deletable number_to_delete=$(wc -l < $deletable) $outputcmd "$number_to_delete files are duplicates." [ $number_to_delete = 0 ] && exit # print duplicates if [ ! "$BE_QUIET" = set ] ; then join -t $'\t' $originals $deletable \ | cut -f 2- \ | tr "\t" " " \ | sed 's/^/duplicates: /' fi # let's start file operations if [ $MOVE_DUPES = set ] ; then DELETE_DUPES=unset # mv leaves no duplicates, so we can skip deletion $outputcmd "Moving duplicates to $MOVEDIR ..." cut -f 2- $deletable \ | tr "\n" "\000" \ | xargs -0 -n 50 $nofileops mv $verboseopt --target-directory="$MOVEDIR" $outputcmd "done." fi if [ $HARDLINK_DUPES = set ] ; then DELETE_DUPES=unset # ln leaves no duplicates, so we can skip deletion $outputcmd "Creating hardlinks ... " join -t $'\t' $originals $deletable \ | cut -f 2- \ | tr "\n\t" "\000\000" \ | xargs -0 -n 2 $nofileops ln $verboseopt -f $outputcmd "done." fi if [ $DELETE_DUPES = set ] ; then $outputcmd "Deleting duplicates ... " cut -f 2- $deletable \ | tr "\n" "\000" \ | xargs -0 $nofileops rm $verboseopt $outputcmd "done." fi # eof