#!/usr/bin/perl
#
# linkdups - Find duplicate files in a directory tree and link() them.
#
# Copyright (C) 2000-2008 Steven Pritchard
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of
# the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# $Id: linkdups,v 1.23 2008/02/28 21:22:19 steve Exp $

use strict;
use warnings;

use Getopt::Std;
use Digest::MD5;
use FileHandle;
use DirHandle;
use File::Basename;

# Work around *stupid* File::Spec::Unix bug.
# NOTE(review): a named sub is installed at compile time, so this override
# is in effect whatever the version test below evaluates to.
if (!$^V or $^V lt v5.10.0) {
    no warnings;
    package File::Spec::Unix;
    sub _cwd {
        require Cwd;
        Cwd::getcwd();
    }
}
# END STUPID WORKAROUND.

sub usage($);
sub info(@);
sub debug(@);
sub recurse_into($);
sub stuff($);
sub checksum($$);

# Result of the most recent lstat()/stat(); stuff() reads it, so callers
# must fill it in for the file they hand to stuff().
our @statbuf = ();

our $total    = 0;    # bytes saved (or recoverable with -n)
our $numtotal = 0;    # files seen by stuff()
our $num      = 0;    # files actually checksummed
our $start    = 0;    # [gettimeofday] at startup when Time::HiRes is usable
our $pass1    = 0;    # time at which pass 1 finished
our %files    = ();

# $data{$device}->{$fsize}->{$inode}={
#     'links'  => $links,
#     'files'  => [ @files ],
#     'mtime'  => $mtime,  # optional (only with -t)
#     'md5sum' => $md5sum, # optional (set by checksum(), or for empty files)
# }
our %data = ();

our %opt;
getopts('rvDnth', \%opt) or usage(1);

usage(0) if $opt{'h'};

our $recursive = defined($opt{'r'}) ? 1 : 0;
our $verbose   = defined($opt{'v'}) ? 1 : 0;
our $debug     = defined($opt{'D'}) ? 1 : 0;
our $dryrun    = defined($opt{'n'}) ? 1 : 0;
our $usetimes  = defined($opt{'t'}) ? 1 : 0;

# SIGUSR1 prints a progress report without interrupting the run.
$SIG{'USR1'} = sub {
    print STDERR "files found: $numtotal, files checked: $num, bytes "
        . ($dryrun ? "recoverable" : "saved")
        . ": $total\n";
};

if ($verbose or $debug) {
    # A "use" inside eval {} still executes at compile time, so it cannot
    # make the module optional.  Load it at run time instead and call its
    # functions fully qualified; $start stays 0 if it is unavailable.
    $start = [ Time::HiRes::gettimeofday() ]
        if (eval { require Time::HiRes; 1 });
}

my @files;
if (@ARGV) {
    @files = @ARGV;
} elsif ($recursive) {
    @files = (".");
} else {
    usage(1);
}

# Pass 1 - stat() everything.
for my $file (@files) {
    if (!(@statbuf = lstat($file))) {
        warn basename($0), ": $file: $!\n";
        next;
    }

    if ($recursive and -d(_)) {
        recurse_into($file);
    } elsif (-f(_)) {
        stuff($file);
    } else {
        warn "$file: not a regular file\n";
        next;
    }
}

if ($verbose or $debug) {
    my $elapsed;
    if ($start) {
        $pass1   = [ Time::HiRes::gettimeofday() ];
        $elapsed = Time::HiRes::tv_interval($start, $pass1);
    } else {
        $pass1   = time;
        $elapsed = $pass1 - $^T;
    }
    info "$numtotal files scanned in $elapsed seconds.\n";
}

our $context = Digest::MD5->new;

# Pass 2 - Check for files with the same size and md5sum them.
for my $dev (keys(%data)) {
    for my $size (sort { $b <=> $a } keys(%{ $data{$dev} })) {
        my $by_inode = $data{$dev}->{$size};
        my @inodes   = keys(%$by_inode);

        if ($usetimes) {
            # With -t only inodes sharing an mtime with at least one other
            # inode of this size are candidates.
            my %t;
            for my $i (@inodes) {
                push(@{ $t{ $by_inode->{$i}->{'mtime'} } }, $i);
            }
            @inodes = grep {
                scalar(@{ $t{ $by_inode->{$_}->{'mtime'} } }) > 1
            } @inodes;
        }

        next unless (@inodes > 1);

        debug "Checking " . scalar(@inodes) . " files of size $size...\n";

        for my $inode (@inodes) {
            checksum($context, $by_inode->{$inode});
        }
    }
}

info "$num files checked in ";
if ($start) {
    info(Time::HiRes::tv_interval($pass1));
} else {
    info(time - $pass1);
}
info " seconds ($numtotal files scanned).\n";

# Pass 3 - Link duplicates.
# NOTE(review): inodes are treated as duplicates when size and MD5 (and
# mtime with -t) agree; file contents are not compared byte for byte.
for my $dev (keys(%data)) {
    for my $size (sort { $b <=> $a } keys(%{ $data{$dev} })) {
        my $by_inode = $data{$dev}->{$size};
        my %dup      = ();

        # Group checksummed inodes by md5sum (by "mtime:md5sum" with -t).
        for my $inode (keys(%$by_inode)) {
            my $entry = $by_inode->{$inode};
            next if (!exists($entry->{'md5sum'}));

            my $key = $entry->{'md5sum'};
            $key = $entry->{'mtime'} . ":" . $key if ($usetimes);
            push(@{ $dup{$key} }, $inode);
        }

        for my $hash (keys(%dup)) {
            next unless (@{ $dup{$hash} } > 1);

            my $nfiles = 0;
            $nfiles += scalar(@{ $by_inode->{$_}->{'files'} })
                for (@{ $dup{$hash} });

            debug "Linking " . scalar(@{ $dup{$hash} }) . " duplicates ("
                . $nfiles
                . " files) of size $size\n";

            # The inode with the most links becomes the master; every name
            # of every other inode is re-pointed at it.
            my @inodes = sort {
                $by_inode->{$b}->{'links'} <=> $by_inode->{$a}->{'links'}
            } @{ $dup{$hash} };

            my $master       = shift(@inodes);
            my $master_entry = $by_inode->{$master};
            my $source       = $master_entry->{'files'}->[0];

            for my $inode (@inodes) {
                my $entry = $by_inode->{$inode};

                debug "$dev:$master (" . $master_entry->{'links'}
                    . " links) -> $dev:$inode (" . $entry->{'links'}
                    . " links)\n";

                for my $file (@{ $entry->{'files'} }) {
                    debug "unlinking $file...\n";
                    debug "linking $source to $file...\n";

                    # A name that could not be replaced is left untouched
                    # and not counted as a new link.
                    next if (!$dryrun and !relink_file($source, $file));

                    $master_entry->{'links'}++;
                }

                if (!$dryrun) {
                    @statbuf = stat($source);
                    warn "Actual link count != expected!\n"
                        if (!@statbuf
                            or $master_entry->{'links'} != $statbuf[3]);
                }

                $total += $size;
            }
        }
    }
}

info "$total bytes ";
info "would be " if ($dryrun);
info "saved.\n";

# stuff($file) - record $file in %data.
#
# Expects @statbuf to hold the lstat() result for $file.  The file is filed
# under device -> size -> inode.  A name for an inode that is already known
# is appended to that inode's 'files' list; otherwise a new record is
# created.  Zero-length files get the MD5 of the empty string up front so
# they never need to be opened.  Always returns 1.
sub stuff($) {
    my $file = shift;
    my ($dev, $inode, $links, $size, $mtime) = @statbuf[0, 1, 3, 7, 9];

    $numtotal++;

    my $known = $data{$dev}->{$size}->{$inode};
    if ($known) {
        # Another name for an inode we already have: keep the existing
        # record (and its other names) instead of replacing it.
        push(@{ $known->{'files'} }, $file);
        return 1;
    }

    my %entry = (
        'links' => $links,
        'files' => [ $file ],
    );
    $entry{'mtime'}  = $mtime if ($usetimes);
    $entry{'md5sum'} = "d41d8cd98f00b204e9800998ecf8427e" if ($size == 0);

    $data{$dev}->{$size}->{$inode} = \%entry;

    return 1;
}

# checksum($context, $i) - store the MD5 of inode record $i in $i->{'md5sum'}.
#
# $context is a Digest::MD5 object that is reset and reused; $i is one
# per-inode record from %data.  Only the first name in $i->{'files'} is
# read.  Does nothing if the record already has an md5sum.  Returns the hex
# digest on success and nothing if the file could not be opened or read, in
# which case the record is left without an md5sum.
sub checksum($$) {
    my ($context, $i) = @_;

    return if (exists($i->{'md5sum'}));

    # Pick the first file in the list and md5sum that.
    my $file = $i->{'files'}->[0];

    my $fh = FileHandle->new($file, "r");
    if (!defined($fh)) {
        warn "Failed to open $file: $!\n";
        return;
    }
    binmode($fh);

    $num++;

    $context->reset();

    # addfile() croaks on a read error; skip this one file rather than
    # abort the whole run.
    if (!eval { $context->addfile($fh); 1 }) {
        warn "Failed to read $file: $@";
        $fh->close();
        $context->reset();
        return;
    }
    $fh->close();

    my $hash = $context->hexdigest();
    debug "$hash $file\n";

    $i->{'md5sum'} = $hash;

    return $hash;
}

# recurse_into($dir) - walk $dir, calling stuff() on every regular file.
#
# Entries are lstat()ed, so symbolic links are neither followed nor
# recorded.  The directory is read completely and closed before descending
# into subdirectories.
sub recurse_into($) {
    my $dir = shift;

    my $dh;
    if (!opendir($dh, $dir)) {
        warn "Can't open $dir: $!\n";
        return;
    }
    my @entries = grep { $_ ne '.' and $_ ne '..' } readdir($dh);
    closedir($dh);

    for my $entry (@entries) {
        my $path = "$dir/$entry";

        if (!(@statbuf = lstat($path))) {
            warn basename($0), ": $path: $!\n";
            next;
        }

        if (-d(_)) {
            recurse_into($path);
        } elsif (-f(_)) {
            stuff($path);
        } else {
            debug basename($0),
                ": '$path' is not a plain file, skipping...\n";
        }
    }

    return;
}

# relink_file($source, $file) - make $file a hard link to $source.
#
# The new link is created under a temporary name in $file's directory and
# then rename()d over $file, so $file is never missing even if link()
# fails.  Returns 1 if $file was replaced and 0 if it was left alone
# (already the same inode as $source, or link()/rename() failed).
sub relink_file {
    my ($source, $file) = @_;

    # rename() between two names of the same inode does nothing, so a name
    # that already points at the source inode is skipped here.
    my @src = lstat($source);
    my @dst = lstat($file);
    if (@src and @dst and $src[0] == $dst[0] and $src[1] == $dst[1]) {
        debug "$file is already a link to $source, skipping...\n";
        return 0;
    }

    my $tmp = dirname($file) . "/.linkdups." . $$;

    if (!link($source, $tmp)) {
        warn "link($source,$tmp) failed: $!\n";
        return 0;
    }

    if (!rename($tmp, $file)) {
        warn "rename($tmp,$file) failed: $!\n";
        unlink($tmp) or warn "unlink($tmp) failed: $!\n";
        return 0;
    }

    return 1;
}

# debug(@list) - print @list to STDERR when -D was given.
sub debug(@) {
    print STDERR @_ if ($debug);
}

# info(@list) - print @list to STDOUT when -v or -D was given.
sub info(@) {
    print @_ if ($verbose or $debug);
}

sub usage($) { print STDERR <