diff --git a/README.md b/README.md
index 364059d..3611ac3 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,10 @@
 # fcompare - A tiny file comparison program written in go
 
 This is a quick way for locating identical files in a folder.
 
-Pass to the program the folder you want to check
-Build
+Pass the folder you want to check to the program
+
+## Build
 
 ```bash
 go get -u github.com/omerh/fcompare
@@ -14,8 +15,13 @@
 cd ~/go/src/github.com/omerh/fcompare
 go build
 ```
 
-Run the app
+## Run
 ```bash
-./fcompare /directory
+fcompare [-t] /directory
 ```
+
+## Args
+arg name | type | default value | usage
+--- | --- | --- | ---
+t | bool | false | set to true to calculate hashes in parallel
diff --git a/main.go b/main.go
index b576ffb..ce0967d 100644
--- a/main.go
+++ b/main.go
@@ -4,88 +4,186 @@ import (
 	"bytes"
 	"crypto/md5"
 	"encoding/hex"
+	"flag"
+	"fmt"
 	"io"
 	"io/ioutil"
-	"log"
 	"os"
 	"path/filepath"
+	"sync"
 )
 
-func check(e error) {
-	if e != nil {
-		log.Print(e)
-		panic(e)
-	}
-}
-
-func init() {
-	// Checking if only executable name passed to the program without an argument
-	if len(os.Args) == 1 {
-		log.Print("Missing argument for files direcory, Exiting...")
-		os.Exit(1)
-	}
-}
-
 func getHashForFile(folder string, file string) []byte {
 	f, err := os.Open(filepath.Join(folder, file))
 	defer f.Close()
-	check(err)
+	if err != nil {
+		panic(err)
+	}
 	h := md5.New()
 	if _, err := io.Copy(h, f); err != nil {
-		check(err)
+		panic(err)
 	}
 	return h.Sum(nil)
 }
 
-func main() {
-	log.Print("starting app")
-	direcory := os.Args[1]
-	files, err := ioutil.ReadDir(direcory)
-	check(err)
+// suspectSet is a thread safe wrapper for a map from file names to hashes, plus the data needed to compute hashes that are still missing
+type suspectSet struct {
+	dir        string
+	suspects   map[string][]byte
+	mutex      *sync.Mutex
+	identicals *identicalsSet
+}
 
-	// map of file size to the first file name
-	fileSizeToCheck := make(map[int64]map[string][]byte)
-	// map of the identical files according to thier md5 hash
-	identicalFiles := make(map[string][]string)
-
+// identicalsSet is a thread safe wrapper for a map from a hex hash string to the slice of files that hash to that value
+type identicalsSet struct {
+	identicals map[string][]string
+	mutex      *sync.Mutex
+}
+
+func (identicals *identicalsSet) addIdentical(hexString, sentinelFile, checkFile string) {
+	identicals.mutex.Lock()
+	defer identicals.mutex.Unlock()
+	identicalSlice, inMap := identicals.identicals[hexString]
+	if inMap {
+		identicals.identicals[hexString] = append(identicalSlice, checkFile)
+	} else {
+		identicals.identicals[hexString] = []string{sentinelFile, checkFile}
+	}
+}
+
+func (suspects *suspectSet) checkSuspects(fileName string, wg *sync.WaitGroup) {
+	suspects.mutex.Lock()
+	defer func() {
+		suspects.mutex.Unlock()
+		wg.Done()
+	}()
+
+	currentHash := getHashForFile(suspects.dir, fileName)
+	for suspect := range suspects.suspects {
+		if suspects.suspects[suspect] == nil {
+			suspects.suspects[suspect] = getHashForFile(suspects.dir, suspect)
+		}
+		if bytes.Equal(suspects.suspects[suspect], currentHash) {
+			hexString := hex.EncodeToString(currentHash)
+			suspects.identicals.addIdentical(hexString, suspect, fileName)
+			break
+		}
+	}
+	suspects.suspects[fileName] = currentHash
+}
+
+func findDuplicatesThreaded(dir string) map[string][]string {
+	files, err := ioutil.ReadDir(dir)
+	if err != nil {
+		panic(err)
+	}
+
+	sizeToSuspects := make(map[int64]*suspectSet)
+	identicals := &identicalsSet{
+		mutex:      &sync.Mutex{},
+		identicals: make(map[string][]string),
+	}
+	wg := sync.WaitGroup{}
 	for i := 0; i < len(files); i++ {
 		if file := files[i]; !file.IsDir() {
 			size := file.Size()
 			name := file.Name()
-			checkedFile, inMap := fileSizeToCheck[size]
+			suspects, inMap := sizeToSuspects[size]
 			if inMap {
-				currentFileHash := getHashForFile(direcory, name)
-			CHECKLOOP:
-				for check := range checkedFile {
-					if checkedFile[check] == nil {
-						checkedFile[check] = getHashForFile(direcory, check)
-					}
-					if bytes.Equal(checkedFile[check], currentFileHash) {
-						hexString := hex.EncodeToString(currentFileHash)
-						identicalSlice, inMap := identicalFiles[hexString]
-						if inMap {
-							identicalFiles[hexString] = append(identicalSlice, name)
-						} else {
-							identicalFiles[hexString] = []string{check, name}
-						}
-						break CHECKLOOP
+				wg.Add(1)
+				go suspects.checkSuspects(name, &wg)
+			} else {
+				sizeToSuspects[size] = &suspectSet{
+					dir:        dir,
+					mutex:      &sync.Mutex{},
+					suspects:   map[string][]byte{name: nil},
+					identicals: identicals,
+				}
+			}
+		}
+	}
+	wg.Wait()
+	return identicals.identicals
+}
+
+func findDuplicates(dir string) map[string][]string {
+	files, err := ioutil.ReadDir(dir)
+	if err != nil {
+		panic(err)
+	}
+
+	sizeToSuspects := make(map[int64]map[string][]byte)
+	identicals := make(map[string][]string)
+
+	for i := 0; i < len(files); i++ {
+		if file := files[i]; !file.IsDir() {
+			size := file.Size()
+			name := file.Name()
+			suspectSet, inMap := sizeToSuspects[size]
+			if inMap {
+				currentHash := getHashForFile(dir, name)
+			SUSPECTLOOP:
+				for suspect := range suspectSet {
+					if suspectSet[suspect] == nil {
+						suspectSet[suspect] = getHashForFile(dir, suspect)
+					}
+					if bytes.Equal(suspectSet[suspect], currentHash) {
+						hexString := hex.EncodeToString(currentHash)
+						identicalSlice, inMap := identicals[hexString]
+						if inMap {
+							identicals[hexString] = append(identicalSlice, name)
+						} else {
+							identicals[hexString] = []string{suspect, name}
+						}
+						break SUSPECTLOOP
 					}
 				}
-				checkedFile[name] = nil
+				suspectSet[name] = currentHash
 			} else {
-				fileSizeToCheck[size] = map[string][]byte{name: nil}
+				sizeToSuspects[size] = map[string][]byte{name: nil}
 			}
 		}
 	}
-	printResult(identicalFiles)
+	return identicals
 }
 
-func printResult(identicalFiles map[string][]string ) {
-	// Print indentical files
-	for k, v := range identicalFiles {
-		log.Printf("The following files are identicals with the hash %v", k)
+func cli() (bool, string) {
+	var threaded = flag.Bool("t", false, "set to parallelize hash calculations")
+
+	flag.Parse()
+
+	if flag.NArg() > 1 {
+		panic(fmt.Sprintf("received too many command line args: %s", flag.Args()))
+	} else if flag.NArg() < 1 {
+		panic("did not receive directory")
+	}
+
+	directory := flag.Arg(0)
+
+	return *threaded, directory
+}
+
+func fcompare(threaded bool, dir string) {
+	var identicals map[string][]string
+	if threaded {
+		identicals = findDuplicatesThreaded(dir)
+	} else {
+		identicals = findDuplicates(dir)
+	}
+
+	for k, v := range identicals {
+		fmt.Println("The following files are identical with hash", k)
 		for _, name := range v {
-			log.Printf("--> %v", name)
+			fmt.Println(" ", name)
 		}
 	}
-}
\ No newline at end of file
+}
+
+func main() {
+	threaded, dir := cli()
+	fcompare(threaded, dir)
+}
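As a side note for review, here is one way to exercise both new code paths together. This is only a sketch, not part of the patch: `findDuplicates` and `findDuplicatesThreaded` come from the main.go changes above, while the test name, temp-directory setup, file names, and contents are assumptions made for illustration.

```go
package main

import (
	"io/ioutil"
	"os"
	"path/filepath"
	"testing"
)

// Sketch only: exercises findDuplicates and findDuplicatesThreaded from the
// patched main.go; the test files and their contents are made up.
func TestFindDuplicates(t *testing.T) {
	dir, err := ioutil.TempDir("", "fcompare")
	if err != nil {
		t.Fatal(err)
	}
	defer os.RemoveAll(dir)

	// a.txt and b.txt are identical; c.txt differs in content and size.
	for name, content := range map[string]string{
		"a.txt": "same content",
		"b.txt": "same content",
		"c.txt": "something else",
	} {
		if err := ioutil.WriteFile(filepath.Join(dir, name), []byte(content), 0644); err != nil {
			t.Fatal(err)
		}
	}

	for label, result := range map[string]map[string][]string{
		"sequential": findDuplicates(dir),
		"threaded":   findDuplicatesThreaded(dir),
	} {
		if len(result) != 1 {
			t.Errorf("%s: expected one group of identical files, got %d", label, len(result))
			continue
		}
		for hash, group := range result {
			if len(group) != 2 {
				t.Errorf("%s: expected two files for hash %s, got %v", label, hash, group)
			}
		}
	}
}
```

Saved next to main.go (for example as main_test.go), `go test` should report the same single duplicate group from both implementations.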