Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

my take on a threaded solution #4

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# fcompare - A tiny file comparison program written in Go

This is a quick way for locating identical files in a folder.
Pass to the program the folder you want to check

Build
Pass the folder you want to check to the program

## Build

```bash
go get -u github.com/omerh/fcompare
Expand All @@ -14,8 +15,13 @@ cd ~/go/src/github.com/omerh/fcompare
go build
```

Run the app
## Run

```bash
./fcompare /directory
fcompare [-t] <directory>
```

## Args
arg name | type | default value | usage
--- | --- | --- | ---
t | bool | false | pass `-t` to calculate hashes in parallel
202 changes: 150 additions & 52 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,88 +4,186 @@ import (
"bytes"
"crypto/md5"
"encoding/hex"
"flag"
"fmt"
"io"
"io/ioutil"
"log"
"os"
"path/filepath"
"sync"
)

// check is a fail-fast helper: a nil error is ignored, while any other error
// is written to the standard logger and then used as the panic value.
func check(e error) {
	if e == nil {
		return
	}
	log.Print(e)
	panic(e)
}

// init runs before main and stops the program when it was started with
// nothing but the executable name: it logs a message and exits with status 1.
func init() {
	if len(os.Args) != 1 {
		return
	}
	log.Print("Missing argument for files direcory, Exiting...")
	os.Exit(1)
}

// getHashForFile returns the MD5 digest of the file named file inside folder.
//
// The file contents are streamed into the hash with io.Copy. If the file
// cannot be opened or read, the function panics with the underlying error.
func getHashForFile(folder string, file string) []byte {
	f, err := os.Open(filepath.Join(folder, file))
	if err != nil {
		panic(err)
	}
	// Register the close only after the open is known to have succeeded.
	defer f.Close()

	h := md5.New()
	if _, err := io.Copy(h, f); err != nil {
		panic(err)
	}
	return h.Sum(nil)
}

func main() {
log.Print("starting app")
direcory := os.Args[1]
files, err := ioutil.ReadDir(direcory)
check(err)
// suspectSet holds the files that may be duplicates of one another, mapped
// from file name to hash, together with the data needed to compute and
// report those hashes. chcekSuspects takes mutex before touching suspects.
type suspectSet struct {
// dir is the directory the file names in suspects are hashed from.
dir string
// suspects maps a file name to its hash; the value is nil until the hash
// has been computed.
suspects map[string][]byte
// mutex is locked by chcekSuspects for the whole of each check.
mutex *sync.Mutex
// identicals receives every pair of files found to share a hash.
identicals *identicalsSet
}

// map of file size to the first file name
fileSizeToCheck := make(map[int64]map[string][]byte)
// map of the identical files according to their md5 hash
identicalFiles := make(map[string][]string)
// identicalsSet is a mutex-guarded map from the hex encoding of a hash to the
// names of the files that hash to it.
type identicalsSet struct {
	identicals map[string][]string
	mutex      *sync.Mutex
}

// addIdentical records checkFile under hexString while holding the set's
// mutex. The first time a hash is seen its group is seeded with sentinelFile
// followed by checkFile; on later calls only checkFile is appended and
// sentinelFile is ignored.
func (s *identicalsSet) addIdentical(hexString, sentinelFile, checkFile string) {
	s.mutex.Lock()
	defer s.mutex.Unlock()

	group, seen := s.identicals[hexString]
	if !seen {
		group = []string{sentinelFile}
	}
	s.identicals[hexString] = append(group, checkFile)
}

// chcekSuspects hashes fileName and compares it against every file already in
// the set, computing and caching hashes that are still nil. The first match
// is reported to the shared identicals set and ends the search. Whether or
// not a match was found, fileName is then stored in the set with its hash.
//
// The set's mutex is held for the whole call. On return the mutex is released
// first and wg.Done is called second.
func (s *suspectSet) chcekSuspects(fileName string, wg *sync.WaitGroup) {
	s.mutex.Lock()
	// Deferred calls run last-in-first-out: Unlock runs first, then Done.
	defer wg.Done()
	defer s.mutex.Unlock()

	fresh := getHashForFile(s.dir, fileName)
	for candidate, known := range s.suspects {
		if known == nil {
			// Hash the earlier file on demand and remember the result.
			known = getHashForFile(s.dir, candidate)
			s.suspects[candidate] = known
		}
		if !bytes.Equal(known, fresh) {
			continue
		}
		s.identicals.addIdentical(hex.EncodeToString(fresh), candidate, fileName)
		break
	}
	s.suspects[fileName] = fresh
}

func findDuplicatesThreaded(dir string) map[string][]string {
files, err := ioutil.ReadDir(dir)
if err != nil {
panic(err)
}

sizeToSuspects := make(map[int64]*suspectSet)
identicals := &identicalsSet{
mutex: &sync.Mutex{},
identicals: make(map[string][]string),
}
wg := sync.WaitGroup{}

for i := 0; i < len(files); i++ {
if file := files[i]; !file.IsDir() {
size := file.Size()
name := file.Name()
checkedFile, inMap := fileSizeToCheck[size]
suspects, inMap := sizeToSuspects[size]
if inMap {
currentFileHash := getHashForFile(direcory, name)
CHECKLOOP:
for check := range checkedFile {
if checkedFile[check] == nil {
checkedFile[check] = getHashForFile(direcory, check)
}
if bytes.Equal(checkedFile[check], currentFileHash) {
hexString := hex.EncodeToString(currentFileHash)
identicalSlice, inMap := identicalFiles[hexString]
if inMap {
identicalFiles[hexString] = append(identicalSlice, name)
} else {
identicalFiles[hexString] = []string{check, name}
}
break CHECKLOOP
wg.Add(1)
go suspects.chcekSuspects(name, &wg)
} else {
sizeToSuspects[size] = &suspectSet{
dir: dir,
mutex: &sync.Mutex{},
suspects: map[string][]byte{name: nil},
identicals: identicals,
}
}
}
}
wg.Wait()
return identicals.identicals
}

// findDuplicates looks at the non-directory entries directly inside dir and
// returns the files whose contents hash identically, keyed by the hex-encoded
// hash.
//
// Files are grouped by size. The first file of each size is recorded with a
// nil hash; hashes are computed only once a second file of the same size
// appears. It panics if dir cannot be read.
func findDuplicates(dir string) map[string][]string {
	files, err := ioutil.ReadDir(dir)
	if err != nil {
		panic(err)
	}

	sizeToSuspects := make(map[int64]map[string][]byte)
	identicals := make(map[string][]string)

	for _, file := range files {
		if file.IsDir() {
			continue
		}
		size, name := file.Size(), file.Name()

		suspects, inMap := sizeToSuspects[size]
		if !inMap {
			// First file of this size: nothing to compare against yet.
			sizeToSuspects[size] = map[string][]byte{name: nil}
			continue
		}

		currentHash := getHashForFile(dir, name)
		for suspect, suspectHash := range suspects {
			if suspectHash == nil {
				// Hash the earlier file on demand and cache the result.
				suspectHash = getHashForFile(dir, suspect)
				suspects[suspect] = suspectHash
			}
			if !bytes.Equal(suspectHash, currentHash) {
				continue
			}

			hexString := hex.EncodeToString(currentHash)
			if group, seen := identicals[hexString]; seen {
				identicals[hexString] = append(group, name)
			} else {
				identicals[hexString] = []string{suspect, name}
			}
			// One match is enough to place the file in its group.
			break
		}
		suspects[name] = currentHash
	}
	return identicals
}

func printResult(identicalFiles map[string][]string ) {
// Print indentical files
for k, v := range identicalFiles {
log.Printf("The following files are identicals with the hash %v", k)
// cli registers the -t flag on the default flag set, parses the command
// line, and returns the flag's value together with the single positional
// argument (the directory). It panics unless exactly one positional argument
// was given.
func cli() (bool, string) {
	threaded := flag.Bool("t", false, "set to parralelize hash calculations")
	flag.Parse()

	switch n := flag.NArg(); {
	case n > 1:
		panic(fmt.Sprintf("received too many command line args: %s", flag.Args()))
	case n < 1:
		panic("did not receive directory")
	}

	return *threaded, flag.Arg(0)
}

func fcompare(threaded bool, dir string) {

var identicals map[string][]string
if threaded {
identicals = findDuplicatesThreaded(dir)
} else {
identicals = findDuplicates(dir)
}

for k, v := range identicals {
fmt.Println("The following files are identicals with the hash", k)
for _, name := range v {
log.Printf("--> %v", name)
fmt.Println(" ", name)
}
}
}
}

func main() {
threaded, dir := cli()
fcompare(threaded, dir)
}