Finding and deleting duplicate files in a folder

The following code aims to identify and delete duplicate files in a folder. Files are first compared by size, and then (if needed) by hash.



The helper function sort_duplicates determines which file of each group of duplicates to keep. I have left out the part of the code dealing with the CLI.



As a self-learner of computer science, I would very much appreciate any feedback.



The code is written in Python 3.6.4.



import os
import hashlib
import collections
import copy

import test_duplicates

def calculate_hash(file):
      """
      file: path to a file object

      Returns a hash of a provided file object
      """
      md5 = hashlib.md5()
      with open(file, 'rb') as fh:
            for chunk in iter(lambda: fh.read(4098), b''):
                  md5.update(chunk)
      return md5.hexdigest()

def sort_duplicates(duplicates):
      """
      duplicates: sequence of groups of duplicates
      returns two lists of the duplicates divided by the rule described in
      'find_duplicates'
      """

      duplicates = copy.deepcopy(duplicates)
      to_keep = []
      to_delete = []
      for group in duplicates:
            file = [file for file in group if ('cop') not in file][0]
            if file:
                  to_keep.append(file)
                  group.remove(file)
                  to_delete.extend(group)
            else:
                  to_keep.append(group.pop(file))
                  to_delete.extend(group)
      return to_keep, to_delete

def find_duplicates(directory, recursive=True,
                    exclude_hidden=False, delete=True):
      """returns
      list of duplicates grouped in sets by common hash
      list of duplicates proposed to delete
      list of duplicates proposed to keep

      to_keep contains an arbitrary duplicate from each group of duplicates if
      the path of the duplicate does not contain the string 'cop';
      if such a duplicate does not exist, to_keep collects an arbitrary
      duplicate from the group
      to_delete contains the rest of the duplicates
      """

      duplicates = collections.defaultdict(set)
      sizes_dict = collections.defaultdict(set)

      # filter by size
      for root, dirnames, filenames in os.walk(directory):
            if exclude_hidden:
                  filenames = (filename for filename in filenames
                               if not filename.startswith('.'))
            for filename in filenames:
                  path = os.path.join(root, filename)
                  sizes_dict[os.path.getsize(path)].add(path)
            if not recursive:
                  break

      groups = [values for values in sizes_dict.values()
                if len(values) > 1]

      for group in groups:
            for item in group:
                  duplicates[calculate_hash(item)].add(item)

      duplicates = [values for values in duplicates.values()
                    if len(values) > 1]

      if not duplicates:
            print('No duplicates has been found')
            return None

      print('----Found following groups of duplicates: ----------------------')
      for group in duplicates:
            print(' - '.join(str(duplicate) for duplicate in group), end='\n')
      print()

      if not delete:
            return

      to_keep, to_delete = sort_duplicates(duplicates)

      print('----Following files from the list of duplicates will be kept:')
      print(*to_keep, sep='\n', end='\n')
      print()
      print('----Following files from the list of duplicates will be deleted:')
      print(*to_delete, sep='\n')

      for path in to_delete:
            os.remove(path)

      print('Done')

      return duplicates, to_keep, to_delete

if __name__ == '__main__':
      test_duplicates.make_files('/tmp/d')
      find_duplicates('/tmp/d')


Output:



----Found following groups of duplicates: ----------------------
/tmp/d/2 - /tmp/d/copied/2
/tmp/d/copied/4 - /tmp/d/4
/tmp/d/3 - /tmp/d/copied/3
/tmp/d/copied/0 - /tmp/d/0
/tmp/d/copied/1 - /tmp/d/1

----Following files from the list of duplicates will be kept:
/tmp/d/2
/tmp/d/4
/tmp/d/3
/tmp/d/0
/tmp/d/1

----Following files from the list of duplicates will be deleted:
/tmp/d/copied/2
/tmp/d/copied/4
/tmp/d/copied/3
/tmp/d/copied/0
/tmp/d/copied/1
Done


Here is the test_duplicates module.



import py.path
import random


from duplicates import find_duplicates


def make_files(directory):
      """
      Creates dummy objects in a given directory:
      10 unique file objects in the parent directory
      copies of the first 5 objects in the subfolder 'copied'
      changed copies (with an appended newline) of the second 5 objects in
      the subfolder 'changed'

      returns: mapping of duplicate objects
               list of objects to keep
               list of objects to delete
      """

      dir_path = py.path.local(directory)
      copied_path = dir_path.join('copied')
      copied_path.ensure(dir=True)
      changed_path = dir_path.join('changed')
      changed_path.ensure(dir=True)
      duplicates, paths, to_delete = [], [], []

      for ind in range(10):
            path = dir_path.join(str(ind))
            path.write(random.randrange(ind * 10 - 10, ind * 10), ensure=True)

            paths.append(path)
            assert path.check()

      # copy the first 5 files into the subfolder 'copied'
      for path in paths[:5]:
            path_copy = copied_path.join(path.basename)
            path.copy(path_copy)
            paths.append(path_copy)
            to_delete.append(path_copy)
            duplicates.append({str(path), str(path_copy)})
            assert path_copy.check()

      # copy the second 5 files and append a newline to each of them:
      for path in paths[5:10]:
            path_changed = changed_path.join(path.basename)
            path.copy(path_changed)
            path_changed.write('\n', mode='a')
            paths.append(path_changed)
            assert path_changed.check()

      to_keep = [str(path) for path in paths[:5]]
      paths.extend((copied_path, changed_path))
      files = sorted(list(dir_path.visit()))
      assert files == sorted(paths)

      return duplicates, to_keep, to_delete


def test_duplicates(tmpdir):
      """Compares received and expected results of the find_duplicates function"""

      exp_duplicates, exp_to_keep, exp_to_delete = make_files(tmpdir)
      rec_duplicates, rec_to_keep, rec_to_delete = find_duplicates(
            tmpdir, recursive=True, delete=True)
      assert len(exp_duplicates) == len(rec_duplicates)
      for group in rec_duplicates:
            assert group in exp_duplicates

      assert sorted(exp_to_keep) == sorted(rec_to_keep)
      assert sorted(exp_to_delete) == sorted(rec_to_delete)

      no_duplicates = tmpdir.join('copied')
      assert find_duplicates(no_duplicates) == None
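

Since the CLI part was skipped, here is a minimal sketch, my own addition rather than part of the original code, of how the if __name__ == '__main__': block in duplicates.py might be replaced by an argparse wrapper; the flag names are hypothetical.

import argparse

def main():
    # Hypothetical CLI wrapper (not part of the original code); assumes it
    # lives in duplicates.py next to find_duplicates.
    parser = argparse.ArgumentParser(
        description='Find and optionally delete duplicate files')
    parser.add_argument('directory', help='directory to scan')
    parser.add_argument('--no-recursive', action='store_true',
                        help='do not descend into subdirectories')
    parser.add_argument('--exclude-hidden', action='store_true',
                        help='skip files whose names start with a dot')
    parser.add_argument('--dry-run', action='store_true',
                        help='only report duplicates, do not delete anything')
    args = parser.parse_args()
    find_duplicates(args.directory,
                    recursive=not args.no_recursive,
                    exclude_hidden=args.exclude_hidden,
                    delete=not args.dry_run)

if __name__ == '__main__':
    main()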







asked Jan 16 at 18:18 by Ayenn Ul, edited Jan 16 at 23:32 by 200_success

  • You said you left out the CLI, therefore you should explain how to call your code. Having a test is great for this, so everything should be fine by now.
    – Roland Illig
    Jan 16 at 19:18










  • If the files are very large, you might speed it up by checking for the file sizes and the content of the first 100 bytes
    – Maarten Fabré
    Mar 2 at 14:23










  • Duplicate Files Deleter Program is superb to find and delete duplicate files from your computer. It searches throughout your computer, create a list of duplicate files and present them before you for deciding what you want to do with these files. A great way to free space of your disks. If you will try you will love it.
    – Jaydeb Nilson
    Apr 14 at 23:01
















3 Answers

Accepted answer (score 6), answered Jan 17 at 18:41 by Peilonrayz

  • I think you should make a chunk_file function, so your code is more DRY.



    This would mean splitting calculate_hash into two: calculate_hash and chunk_file.



  • I think you should be able to pass the type of hash you want calculate_hash to use, since you can have hash collisions.



  • find_duplicates should be split into more functions.



    • You should make a files_under function that finds all the files under a specific directory.

    • You should make a filter_hidden function, so you can change the input of find_duplicates, rather than relying on find_duplicates to do everything.


    • find_duplicates shouldn't have the printing code in it; printing filtered duplicates isn't about finding duplicates.


  • Rather than using if not recursive: break I'd filter the input to the loop.


  • I wouldn't build the intermediary groups list, as you're doubling the amount of memory you use for no real gain.

  • I'd merge your duplicates and sizes_dict code into one loop, so they become the same variable. You're doing the same thing with both of them, and carrying on the way you are now is hard to extend.

  • I would use pathlib so that you have an easier-to-use object than a string.

I changed your code, but didn't implement all of it. I find sort_duplicates quite confusing, so I left it as it is.



Below are my changes:



import collections
import copy
import functools
import hashlib
import io
import os
import pathlib


def chunk_file(file_path, chunk_size=None):
    # Read in binary mode so the chunks can be fed to hashlib.
    with open(file_path, 'rb') as file:
        if chunk_size is None:
            # _CHUNK_SIZE is a private attribute of text streams; fall back to
            # io's default buffer size when it isn't available.
            chunk_size = getattr(file, '_CHUNK_SIZE', io.DEFAULT_BUFFER_SIZE)
        while True:
            data = file.read(chunk_size)
            if not data:
                break
            yield data


def calculate_hash(file_path, hash):
    hash_object = hash()
    for chunk in chunk_file(file_path):
        hash_object.update(chunk)
    return hash_object.hexdigest()


def files_under(directory, recursive=True):
    walk = os.walk(directory)
    if not recursive:
        walk = [next(walk)]

    for root, _, file_names in walk:
        root = pathlib.Path(root)
        for file_name in file_names:
            yield root / file_name


def filter_hidden(paths):
    for path in paths:
        if not path.name.startswith('.'):
            yield path


def sort_duplicates(duplicates):
    """
    duplicates: sequence of groups of duplicates
    returns two lists of the duplicates divided by the rule described in
    'find_duplicates'
    """

    duplicates = copy.deepcopy(duplicates)
    to_keep = []
    to_delete = []
    for group in duplicates:
        file = [file for file in group if ('cop') not in file][0]
        if file:
            to_keep.append(file)
            group.remove(file)
            to_delete.extend(group)
        else:
            to_keep.append(group.pop(file))
            to_delete.extend(group)
    return to_keep, to_delete


def find_duplicates(directory, recursive=True,
                    exclude_hidden=False, delete=True):
    methods = [
        lambda path: path.stat().st_size,
        functools.partial(calculate_hash, hash=hashlib.md5),
        functools.partial(calculate_hash, hash=hashlib.sha256)
    ]

    # build the candidate paths with the helpers above
    paths = files_under(directory, recursive)
    if exclude_hidden:
        paths = filter_hidden(paths)

    prev = {None: set(paths)}
    for method in methods:
        new = collections.defaultdict(set)
        for group in prev.values():
            if len(group) <= 1:
                continue

            for item in group:
                new[method(item)].add(item)
        prev = new
    return prev
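

For completeness, here is one possible way to drive the refactored pieces. This wiring is my own illustration, not part of the answer, and it keeps the reporting outside find_duplicates, as the bullet points above suggest.

# Hypothetical driver for the refactored functions above (not from the answer).
if __name__ == '__main__':
    groups = find_duplicates('/tmp/d', recursive=True, exclude_hidden=False)
    duplicate_groups = [group for group in groups.values() if len(group) > 1]
    if not duplicate_groups:
        print('No duplicates have been found')
    else:
        for group in duplicate_groups:
            print(' - '.join(str(path) for path in group))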





  • Thank you very much, I have learned a lot from your answer! I was also considering using path objects instead of strings, in particular via the py.path module, as it implements a path.computehash method. I have tried to compare the performance of the custom compute_hash function and the py.path.computehash method here, but I assume I made some mistake, as the timing results are unreasonably large. If you have time, please take a look.
    – Ayenn Ul
    Jan 18 at 8:47











  • The way you limited the non-recursive search by using the first outcome of the next function in the files_under function is very neat. In this function you yield root / file_name; as far as I understand, this is how you obtain the full path of the file. Could I use this method instead of os.path.join everywhere, or are there some limitations? I also searched for the _CHUNK_SIZE attribute in the documentation but found only the chunk module. From my understanding, it deals only with certain kinds of media files. So _CHUNK_SIZE must be based on something else?
    – Ayenn Ul
    Jan 18 at 10:35










  • @OzRamsay Thank you :) Yes, root / file_name is a high-level alternative to os.path.join. As far as I know there are no limitations. I used dir(file) and noticed a _CHUNK_SIZE variable, and so thought Python's default chunk size would be a reasonable default. I don't think it's documented, as it is also a private field.
    – Peilonrayz
    Jan 18 at 10:41










  • making the search recursive or not with pathlib.Path is even simpler.
    – Maarten Fabré
    Mar 2 at 14:19


















Answer (score 5) by Ev. Kounis

Your code is, generally speaking, nice-looking if I may say so. One thing that caught my eye is your sort_duplicates function. It seems a bit convoluted and messy.



More specifically:



  1. The list comprehension will throw an error if the list comes back empty ([0] -> IndexError). As a result, checking if file does not make sense; it would already have failed.

  2. An entire list comprehension is not needed if you just want the first item. Use next instead; it would be much faster and can provide you with a default value in case nothing is found.

  3. The if and else blocks contain the same code, so... you do not need them. There is no reason to check for something if your decisions will not be influenced by the check.

  4. In the now meaningless else there is a group.pop(file) when file does not exist. What's up with that?

  5. The recommended indentation is only 4 spaces. You are using 6, thus making the nested blocks look too isolated.

So I would re-write as:



def sort_duplicates(duplicates):
    """
    duplicates: sequence of groups of duplicates
    returns two lists of the duplicates divided by the rule described in
    'find_duplicates'
    """

    duplicates = copy.deepcopy(duplicates)
    to_keep = []
    to_delete = []
    for group in duplicates:
        file = next((file for file in group if 'cop' not in file), None)
        if file is not None:
            group.remove(file)   # sets have no pop(item); remove it, then keep it
            to_keep.append(file)
        to_delete.extend(group)
    return to_keep, to_delete


I will be updating my answer if I see anything else worth pointing out.




Based on your comments below, we can rearrange the code to handle a fall-back scenario:



def sort_duplicates(duplicates):
    """
    duplicates: sequence of groups of duplicates
    returns two lists of the duplicates divided by the rule described in
    'find_duplicates'
    """

    duplicates = copy.deepcopy(duplicates)
    to_keep = []
    to_delete = []
    file, prev_file = '', ''
    for group in duplicates:
        file = next((file for file in group if 'cop' not in file), prev_file)
        if file != prev_file:
            group.remove(file)            # a fresh match was found; keep it
            to_keep.append(file)
        else:
            to_keep.append(group.pop())   # fall back to an arbitrary member
        to_delete.extend(group)
        prev_file = file
    return to_keep, to_delete


Note that I cannot properly test the above, and as such there might be small bugs that you would have to amend. However, the important thing for me is that you understand the logic.
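

As a quick illustration (mine, not from the answer), the revised helper could be exercised on a couple of made-up groups:

# Hypothetical demonstration of the revised sort_duplicates (paths are made up).
groups = [{'/tmp/d/1', '/tmp/d/copied/1'},
          {'/tmp/d/copied/2', '/tmp/d/copy/2'}]
to_keep, to_delete = sort_duplicates(groups)
print(to_keep)    # ['/tmp/d/1', <an arbitrary member of the second group>]
print(to_delete)  # the remaining paths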







  • Thank you very much for the valuable results. file = [file for file in group if ('cop') not in file][0] if file: to_keep.append(file) group.remove(file) to_delete.extend(group) else: to_keep.append(group.pop(file)) to_delete.extend(group)
    – Ayenn Ul
    Jan 17 at 16:02











  • @OzRamsay You are welcome. Is there a reason you posted that code in the comment?
    – Ev. Kounis
    Jan 17 at 16:05










  • Yes, in regards to your remark 3: I wanted to keep only one file of each group of duplicates. To choose such a file I add filenames which do not contain the string 'cop' (to exclude filenames such as 'copy1' or 'copied') to the list 'files'. If the list is not empty I choose an arbitrary name from the list (it happens to be the first one). If there are no such names (i.e. all names contain the string 'cop') I can choose any name from the entire group. In this case, it will be the last processed filename, which is stored as the variable 'file' from the previous list comprehension. What do you think about that?
    – Ayenn Ul
    Jan 17 at 16:12










  • @OzRamsay Well, I cannot comment on the idea because that is just what you want to do. The how is where I come in, and I can tell you that the code you had did not do that. If the list comes back empty, your code raises an error... There is no falling back to the previous file value.
    – Ev. Kounis
    Jan 17 at 16:19







  • @OzRamsay Take a look! This might make it even clearer.
    – Ev. Kounis
    Jan 17 at 16:25



















Answer (score 2) by miracle173

I didn't analyze the whole program carefully, so this review is not complete from my point of view.



Use the Right Library Calls



If you read/write files, be careful that you do it in the right way. You can open a file in text mode or binary mode. If you compare arbitrary files then binary mode may be the correct one; check the Python Tutorial. If text mode is the wrong one, which test case could demonstrate this? Maybe a file containing '\n\r\n' and a file containing '\r\n\n': both contain three bytes, are different in binary mode, and may be the same in text mode, because they both contain two empty lines, but I did not check this ('\n' is the newline character and '\r' is the carriage return character).
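

A small sketch of such a test case (my own illustration of the point, not code from the answer):

# Illustrative only: shows why comparisons and hashing should read bytes.
with open('a.bin', 'wb') as f:
    f.write(b'\n\r\n')
with open('b.bin', 'wb') as f:
    f.write(b'\r\n\n')

# In binary mode the files clearly differ ...
with open('a.bin', 'rb') as fa, open('b.bin', 'rb') as fb:
    assert fa.read() != fb.read()

# ... while universal-newline text mode translates both to '\n\n'.
with open('a.bin') as fa, open('b.bin') as fb:
    print(fa.read() == fb.read())  # prints True with default newline handling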



Improve your Algorithm



Your program could do the search for duplicate files more efficiently. A file can only be a duplicate of another file if both files have the same length. The length of a file can easily be retrieved from the operating system. So first get the name and size of every file you want to investigate and group them by size. Only if a group contains more than one file do you have to investigate that group further. This can help avoid reading a lot of large files.
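

The question's sizes_dict loop already implements this; restated compactly as a standalone sketch (my own, using pathlib purely for illustration):

# Sketch: group candidate files by size before any hashing (illustration only).
import collections
import pathlib

def group_by_size(directory):
    groups = collections.defaultdict(set)
    for path in pathlib.Path(directory).rglob('*'):
        if path.is_file():
            groups[path.stat().st_size].add(path)
    # only groups with more than one file need further inspection
    return [group for group in groups.values() if len(group) > 1]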



If you have two large files of equal size and large parts of them are not equal, then comparing the files at a few random positions gives a high probability of finding a position where they differ. If they differ, you have avoided reading the whole files. The seek method changes the stream position. Only if the files are identical at these random positions do you continue with comparing the hashes of the whole files.
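

A hedged sketch of that idea (my own illustration, not code from the answer): probe a few offsets with seek and only hash in full when every probe matches. The probe count and width are arbitrary choices.

# Illustration only: quick probabilistic pre-check before hashing whole files.
import os
import random

def probably_equal(path_a, path_b, probes=8, width=64):
    """Compare two same-size files at a few random offsets.

    Returns False as soon as a differing region is found; True means the
    files still might be equal and should be hashed in full.
    """
    size = os.path.getsize(path_a)
    with open(path_a, 'rb') as fa, open(path_b, 'rb') as fb:
        for _ in range(probes):
            offset = random.randrange(max(size - width, 0) + 1)
            fa.seek(offset)
            fb.seek(offset)
            if fa.read(width) != fb.read(width):
                return False
    return True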



Don't Use Magic Constants



You use the magic string constant 'cop' somewhere in your code. You shouldn't use such magic constants. If you need them, define a global variable, choose an appropriate name and maybe write an explanation in a comment. Or, if it is not really a constant, you should supply it as an argument of the function. In your case I think it should either be supplied as an argument, or you should define some kind of test mode for your program (e.g. by setting a global variable TEST_MODE=True) where this global variable for 'cop' is used.



There is another magic constant in your code: 4098. Again you should define a global variable with a descriptive name. We have:




4098 Byte = 4 KByte + 2 Byte


Are you sure that 4098 is the value you want to use?



In your special case it seems that you have a special treatment for path names that contain /copied/, but you didn't program it that way. I think you should (almost) always try to write your code so that it actually expresses what you want; this makes it simpler to understand what you do.
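

For illustration only (the names are my own suggestions, and 4096 is used instead of 4098, as the remark above hints), the two magic values could be lifted into named constants:

# Hypothetical module-level constants replacing the magic values.
import hashlib

CHUNK_SIZE = 4096        # bytes read per iteration while hashing (4 KiB)
COPY_MARKER = 'cop'      # substring that marks a copied file; better passed in

def calculate_hash(file, chunk_size=CHUNK_SIZE):
    md5 = hashlib.md5()
    with open(file, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()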



Different Names for Different Variables



I also don't like a statement where the same variable name has two different meanings like in



file = [file for file in group if ('cop') not in file][0]


The file on the right of the = expression is a different variable than the one on the left of =. Better write



file = [f for f in group if ('cop') not in f][0]


or something similar, this makes it easier to understand.



Don't Use deepcopy Automatically



You use copy.deepcopy. This may be deeper than necessary. You actually only modify the sets, so it would be sufficient to change lines like



group.remove(file)
to_delete.extend(group)


and



to_keep.append(group.pop(file))
to_delete.extend(group)


to



g = copy.copy(group)
g.remove(file)
to_delete.extend(g)


and



g = copy.copy(group)
to_keep.append(g.pop(file))
to_delete.extend(g)


or you replace



duplicates = copy.deepcopy(duplicates)


by



duplicates = [set(group) for group in duplicates]


Whether deepcopy actually makes a copy of the contained objects, too, depends on their types. For your algorithm a copy of the objects is not necessary. In the case of strings it seems to depend on the Python implementation whether copies of the strings are made (see this Stack Overflow answer). The Python implementation on my notebook does not make copies of the strings; the strings of the deepcopy point to the same objects:



>>> x = 'abc'
>>> y = copy.deepcopy(x)
>>> print(id(x), id(y))
50516992 50516992


But if you use your functions on lists of sets of mutable objects, copies of those objects will be made; this is not necessary and may slow a program down. I admit that in your case, where you read a lot of files from disk, this will not influence the overall performance of the program.



Don't Relate the Description and Names of a Function and its Variables to Code that isn't Related



Ev. Kounis already showed you some serious problems of the sort_duplicates function.
I will concentrate on non-technical aspects. I don't like the naming and description of sort_duplicates. You are talking about files and duplicates, but this function is not about files and duplicates; it is about a list of sets of strings, and if you didn't have this 'cop' thing it would be about a list of sets of arbitrary objects, not even about strings. You want to choose one representative of each set and return a list of these chosen representatives and a list of the remaining group elements. There is nothing about duplicates in that. So I would change your naming and description. This makes it easier for a reader to understand the function, and also easier to reuse it in another context or in a library.
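

To make that concrete, a more generically named version might look like this sketch; the name, signature and prefer argument are my own, not proposed in the answer:

# Hypothetical generic helper: pick one representative per group of objects.
def split_representatives(groups, prefer=lambda item: True):
    """For each group, keep one element (preferably one satisfying prefer)
    and collect the rest; returns (kept, rest)."""
    kept, rest = [], []
    for group in groups:
        group = set(group)
        candidates = [item for item in group if prefer(item)]
        chosen = candidates[0] if candidates else next(iter(group))
        group.discard(chosen)
        kept.append(chosen)
        rest.extend(group)
    return kept, rest

# Usage in the duplicate-file context (the 'cop' rule becomes an argument):
# to_keep, to_delete = split_representatives(duplicates,
#                                            prefer=lambda p: 'cop' not in p)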






  • edited the post: some notes about the constant 4098 used in the code
    – miracle173
    Mar 2 at 11:25










Your Answer




StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);



);








 

draft saved


draft discarded


















StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f185244%2ffinding-and-deleting-duplicate-files-in-a-folder%23new-answer', 'question_page');

);

Post as a guest






























3 Answers
3






active

oldest

votes








3 Answers
3






active

oldest

votes









active

oldest

votes






active

oldest

votes








up vote
6
down vote



accepted











  • I think you should make a chunk_file function, so your code is more DRY.



    This would mean splitting calculate_hash and chunk_file into two.



  • I think you should be able to pass what type of hash you want calculate_hash to return. This is as you can have hash-collisions.



  • find_duplicates should be split into more functions.



    • You should make a files_under function that finds all the files under a specific directory.

    • You should make a filter_hidden function, so you can change the input of find_duplicates, rather than relying on find_duplicates to do everything.


    • find_duplicates shouldn't have the printing code in it. This is as printing filtered duplicates isn't about finding duplicates.


  • Rather than using if not recursive: break I'd filter the input to the loop.


  • I wouldn't build a groups intermarry list. As you're doubling the amount of memory you're using for no viable gain.

  • I'd merge your duplicates and sizes_dict code into a loop. And so they are the same variable. This is as you're doing the same thing over both the things. And carrying on the way you are no is hard to extend.

  • I would use pathlib so that you have an easier to use object, than a string.

I changed your code, however didn't implement all of it. I find sort_duplicates to be quite confusing, and so left that as it is.



Below are my changes:



import functools
import os
import pathlib


def chunk_file(file_path, chunk_size=None):
with open(file_path) as file:
if chunk_size is None:
chunk_size = file._CHUNK_SIZE
while True:
data = file.read(chunk_size)
if not data:
break
yield data


def calculate_hash(file_path, hash):
hash_object = hash()
for chunk in chunk_file(file_path):
hash_object.update(chunk)
return hash_object.hexdigest()


def files_under(directory, recursive=True):
walk = os.walk(directory):
if not recursive:
walk = [next(walk)]

for root, _, file_names in walk
root = pathlib.Path(root)
for file_name in file_names:
yield root / file_name


def filter_hidden(paths):
for path in paths:
if not path.name.startswith('.'):
yield path


def sort_duplicates(duplicates):
"""
duplicates: sequence of groups of duplicates
returns two list of the duplicates divided by the rule described in
'find_duplicates'
"""

duplicates = copy.deepcopy(duplicates)
to_keep =
to_delete =
for group in duplicates:
file = [file for file in group if ('cop') not in file][0]
if file:
to_keep.append(file)
group.remove(file)
to_delete.extend(group)
else:
to_keep.append(group.pop(file))
to_delete.extend(group)
return to_keep, to_delete


def find_duplicates(directory, recursive=True,
exclude_hidden=False, delete=True):
methods = [
lambda path: path.stat().st_size,
functools.partial(calculate_hash, hash=hashlib.md5),
functools.partial(calculate_hash, hash=hashlib.sha256)
]

prev = None: set(paths)
for method in methods:
new = collections.defaultdict(set)
for group in prev.values():
if len(group) <= 1:
continue

for item in group:
new[method(item)].add(item)
prev = new
return prev





share|improve this answer





















  • Thank you very much, I have learned a lot from your answer! I was also considering using a path objects instead of a string, in particular by using py.path module, as it implements path.computehash method. I have tried to compare performances of the custom compute_hash function and py.path.computehash method here, but I assume I made some mistake as the timing results are unreasonably large. If you have time, please, take a look.
    – Ayenn Ul
    Jan 18 at 8:47











  • The way you limited the non-recursive search by using the first outcome of the next function in files_under function is very neat. In this function, you yield the root / file_name, as far as I understand, in this way you obtain the full path of the file. Could I use this method instead of os.path.join everywhere, or are there some limitations? I also searched for _CHUNK_SIZE attribute in the documentation but found only chunk module. From my understanding, it deals only with certain kinds of media files. So, _CHUNK_SIZE must be based on something else?
    – Ayenn Ul
    Jan 18 at 10:35










  • @OzRamsay Thank you, :) Yes root / file_name is a high-level alturnate to os.path.join. As far as I know there are no limitations. I used dir(file), and noticed a _CHUNK_SIZE variable, and so thought using Python's default chunk size to be a reasonable default. I don't think it's documented, as it is also a private field.
    – Peilonrayz
    Jan 18 at 10:41










  • making the search recursive or not with pathlib.Path is even simpler.
    – Maarten Fabré
    Mar 2 at 14:19














up vote
6
down vote



accepted











  • I think you should make a chunk_file function, so your code is more DRY.



    This would mean splitting calculate_hash and chunk_file into two.



  • I think you should be able to pass what type of hash you want calculate_hash to return. This is as you can have hash-collisions.



  • find_duplicates should be split into more functions.



    • You should make a files_under function that finds all the files under a specific directory.

    • You should make a filter_hidden function, so you can change the input of find_duplicates, rather than relying on find_duplicates to do everything.


    • find_duplicates shouldn't have the printing code in it. This is as printing filtered duplicates isn't about finding duplicates.


  • Rather than using if not recursive: break I'd filter the input to the loop.


  • I wouldn't build a groups intermarry list. As you're doubling the amount of memory you're using for no viable gain.

  • I'd merge your duplicates and sizes_dict code into a loop. And so they are the same variable. This is as you're doing the same thing over both the things. And carrying on the way you are no is hard to extend.

  • I would use pathlib so that you have an easier to use object, than a string.

I changed your code, however didn't implement all of it. I find sort_duplicates to be quite confusing, and so left that as it is.



Below are my changes:



import functools
import os
import pathlib


def chunk_file(file_path, chunk_size=None):
with open(file_path) as file:
if chunk_size is None:
chunk_size = file._CHUNK_SIZE
while True:
data = file.read(chunk_size)
if not data:
break
yield data


def calculate_hash(file_path, hash):
hash_object = hash()
for chunk in chunk_file(file_path):
hash_object.update(chunk)
return hash_object.hexdigest()


def files_under(directory, recursive=True):
walk = os.walk(directory):
if not recursive:
walk = [next(walk)]

for root, _, file_names in walk
root = pathlib.Path(root)
for file_name in file_names:
yield root / file_name


def filter_hidden(paths):
for path in paths:
if not path.name.startswith('.'):
yield path


def sort_duplicates(duplicates):
"""
duplicates: sequence of groups of duplicates
returns two list of the duplicates divided by the rule described in
'find_duplicates'
"""

duplicates = copy.deepcopy(duplicates)
to_keep =
to_delete =
for group in duplicates:
file = [file for file in group if ('cop') not in file][0]
if file:
to_keep.append(file)
group.remove(file)
to_delete.extend(group)
else:
to_keep.append(group.pop(file))
to_delete.extend(group)
return to_keep, to_delete


def find_duplicates(directory, recursive=True,
exclude_hidden=False, delete=True):
methods = [
lambda path: path.stat().st_size,
functools.partial(calculate_hash, hash=hashlib.md5),
functools.partial(calculate_hash, hash=hashlib.sha256)
]

prev = None: set(paths)
for method in methods:
new = collections.defaultdict(set)
for group in prev.values():
if len(group) <= 1:
continue

for item in group:
new[method(item)].add(item)
prev = new
return prev





share|improve this answer





















  • Thank you very much, I have learned a lot from your answer! I was also considering using a path objects instead of a string, in particular by using py.path module, as it implements path.computehash method. I have tried to compare performances of the custom compute_hash function and py.path.computehash method here, but I assume I made some mistake as the timing results are unreasonably large. If you have time, please, take a look.
    – Ayenn Ul
    Jan 18 at 8:47











  • The way you limited the non-recursive search by using the first outcome of the next function in files_under function is very neat. In this function, you yield the root / file_name, as far as I understand, in this way you obtain the full path of the file. Could I use this method instead of os.path.join everywhere, or are there some limitations? I also searched for _CHUNK_SIZE attribute in the documentation but found only chunk module. From my understanding, it deals only with certain kinds of media files. So, _CHUNK_SIZE must be based on something else?
    – Ayenn Ul
    Jan 18 at 10:35










  • @OzRamsay Thank you, :) Yes root / file_name is a high-level alturnate to os.path.join. As far as I know there are no limitations. I used dir(file), and noticed a _CHUNK_SIZE variable, and so thought using Python's default chunk size to be a reasonable default. I don't think it's documented, as it is also a private field.
    – Peilonrayz
    Jan 18 at 10:41










  • making the search recursive or not with pathlib.Path is even simpler.
    – Maarten Fabré
    Mar 2 at 14:19












up vote
6
down vote



accepted







up vote
6
down vote



accepted







  • I think you should make a chunk_file function, so your code is more DRY.



    This would mean splitting calculate_hash and chunk_file into two.



  • I think you should be able to pass what type of hash you want calculate_hash to return. This is as you can have hash-collisions.



  • find_duplicates should be split into more functions.



    • You should make a files_under function that finds all the files under a specific directory.

    • You should make a filter_hidden function, so you can change the input of find_duplicates, rather than relying on find_duplicates to do everything.


    • find_duplicates shouldn't have the printing code in it. This is as printing filtered duplicates isn't about finding duplicates.


  • Rather than using if not recursive: break I'd filter the input to the loop.


  • I wouldn't build a groups intermarry list. As you're doubling the amount of memory you're using for no viable gain.

  • I'd merge your duplicates and sizes_dict code into a loop. And so they are the same variable. This is as you're doing the same thing over both the things. And carrying on the way you are no is hard to extend.

  • I would use pathlib so that you have an easier to use object, than a string.

I changed your code, however didn't implement all of it. I find sort_duplicates to be quite confusing, and so left that as it is.



Below are my changes:



import functools
import os
import pathlib


def chunk_file(file_path, chunk_size=None):
with open(file_path) as file:
if chunk_size is None:
chunk_size = file._CHUNK_SIZE
while True:
data = file.read(chunk_size)
if not data:
break
yield data


def calculate_hash(file_path, hash):
hash_object = hash()
for chunk in chunk_file(file_path):
hash_object.update(chunk)
return hash_object.hexdigest()


def files_under(directory, recursive=True):
walk = os.walk(directory):
if not recursive:
walk = [next(walk)]

for root, _, file_names in walk
root = pathlib.Path(root)
for file_name in file_names:
yield root / file_name


def filter_hidden(paths):
for path in paths:
if not path.name.startswith('.'):
yield path


def sort_duplicates(duplicates):
"""
duplicates: sequence of groups of duplicates
returns two list of the duplicates divided by the rule described in
'find_duplicates'
"""

duplicates = copy.deepcopy(duplicates)
to_keep =
to_delete =
for group in duplicates:
file = [file for file in group if ('cop') not in file][0]
if file:
to_keep.append(file)
group.remove(file)
to_delete.extend(group)
else:
to_keep.append(group.pop(file))
to_delete.extend(group)
return to_keep, to_delete


def find_duplicates(directory, recursive=True,
exclude_hidden=False, delete=True):
methods = [
lambda path: path.stat().st_size,
functools.partial(calculate_hash, hash=hashlib.md5),
functools.partial(calculate_hash, hash=hashlib.sha256)
]

prev = None: set(paths)
for method in methods:
new = collections.defaultdict(set)
for group in prev.values():
if len(group) <= 1:
continue

for item in group:
new[method(item)].add(item)
prev = new
return prev





answered Jan 17 at 18:41 – Peilonrayz











  • Thank you very much, I have learned a lot from your answer! I was also considering using path objects instead of strings, in particular the py.path module, as it implements a path.computehash method. I have tried to compare the performance of the custom compute_hash function and the py.path.computehash method here, but I assume I made some mistake as the timing results are unreasonably large. If you have time, please take a look.
    – Ayenn Ul
    Jan 18 at 8:47











  • The way you limited the non-recursive search by using the first outcome of the next function in the files_under function is very neat. In this function you yield root / file_name, which, as far as I understand, gives you the full path of the file. Could I use this method instead of os.path.join everywhere, or are there some limitations? I also searched for the _CHUNK_SIZE attribute in the documentation but found only the chunk module. From my understanding, it deals only with certain kinds of media files. So _CHUNK_SIZE must be based on something else?
    – Ayenn Ul
    Jan 18 at 10:35










  • @OzRamsay Thank you :) Yes, root / file_name is a high-level alternative to os.path.join. As far as I know there are no limitations. I used dir(file) and noticed a _CHUNK_SIZE variable, and so thought using Python's default chunk size would be a reasonable default. I don't think it's documented, as it is also a private field.
    – Peilonrayz
    Jan 18 at 10:41










  • making the search recursive or not with pathlib.Path is even simpler.
    – Maarten Fabré
    Mar 2 at 14:19
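
A minimal sketch of what that last comment might mean, using pathlib's glob/rglob instead of os.walk; this variant is my own illustration, not code from the answer:

import pathlib


def files_under(directory, recursive=True):
    directory = pathlib.Path(directory)
    # rglob('*') walks the whole tree; glob('*') only looks at the top level.
    candidates = directory.rglob('*') if recursive else directory.glob('*')
    for path in candidates:
        if path.is_file():
            yield path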
















up vote 5 down vote













Your code is generally speaking nice-looking if I may say so. One thing that caught my eye is your sort_duplicates function. This one seems a bit convoluted and messy.



More specifically:



  1. The list comprehension will throw an Error if the list comes back empty ([0] -> IndexError). As a result, checking for if file does not make sense. It would have already failed.

  2. An entire list comprehension is not needed if you just want the first item. Use next instead; it would be much faster and can provide you with a default value in case nothing is found.

  3. The if and else blocks contain the same code, so... You do not need them. There is no reason to check for something if your decisions will not be influenced by the check.

  4. In the now meaningless else there is a group.pop(file) when file does not exist. What's up with that?

  5. The recommended indentation is only 4 spaces. You are using 6 thus making the nested blocks look too isolated.

So I would re-write as:



def sort_duplicates(duplicates):
    """
    duplicates: sequence of groups of duplicates
    returns two lists of the duplicates divided by the rule described in
    'find_duplicates'
    """
    duplicates = copy.deepcopy(duplicates)
    to_keep = []
    to_delete = []
    for group in duplicates:
        file = next((file for file in group if 'cop' not in file), None)
        if file is not None:
            group.remove(file)
            to_keep.append(file)
        to_delete.extend(group)
    return to_keep, to_delete


I will be updating my answer if I see anything else worth pointing out.




Based on your comments below, we can rearrange the code further to include a fall-back scenario:



def sort_duplicates(duplicates):
    """
    duplicates: sequence of groups of duplicates
    returns two lists of the duplicates divided by the rule described in
    'find_duplicates'
    """
    duplicates = copy.deepcopy(duplicates)
    to_keep = []
    to_delete = []
    file, prev_file = '', ''
    for group in duplicates:
        file = next((file for file in group if 'cop' not in file), prev_file)
        if file != prev_file:
            group.remove(file)
            to_keep.append(file)
        else:
            # No suitable name found: fall back to an arbitrary member.
            to_keep.append(group.pop())
        to_delete.extend(group)
        prev_file = file
    return to_keep, to_delete


Note that I cannot properly test the above, and as such there might be small bugs that you would have to amend. However, the important thing for me is that you understand the logic.
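
Since the above is untested, a tiny made-up sanity check of the fall-back version might look like this (the paths are invented for the example):

groups = [
    {'/tmp/d/1', '/tmp/d/copied/1'},       # one name without 'cop'
    {'/tmp/d/copy/2', '/tmp/d/copied/2'},  # every name contains 'cop'
]
to_keep, to_delete = sort_duplicates(groups)
print(to_keep)    # e.g. ['/tmp/d/1', '/tmp/d/copied/2'] - the second pick is arbitrary
print(to_delete)  # the two paths that were not picked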






edited Jan 17 at 16:34, answered Jan 17 at 10:13 – Ev. Kounis











  • Thank you very much for the valuable results. file = [file for file in group if ('cop') not in file][0] if file: to_keep.append(file) group.remove(file) to_delete.extend(group) else: to_keep.append(group.pop(file)) to_delete.extend(group)
    – Ayenn Ul
    Jan 17 at 16:02











  • @OzRamsay You are welcome. Is there a reason you posted that code in the comment?
    – Ev. Kounis
    Jan 17 at 16:05










  • Yes, in regards to your remark 3: I wanted to keep only one file of each group of duplicates. To choose such a file I add filenames which do not contain the string 'cop' (to exclude filenames such as 'copy1' or 'copied') to the list 'files'. If the list is not empty I choose an arbitrary name from the list (it happens to be the first one). If there are no such names (i.e. all names have the string 'cop') I can choose any name from the entire group. In this case, it will be the last processed filename, which is stored as the variable 'file' from the previous list comprehension. What do you think about that?
    – Ayenn Ul
    Jan 17 at 16:12










  • @OzRamsay Well, I cannot comment on the idea because that is just what you want to do. The how is where I come in, and I can tell you that the code you had did not do that. If the list comes back empty your code raises an error... There is no falling back to the previous file value.
    – Ev. Kounis
    Jan 17 at 16:19







  • @OzRamsay Take a look! This might make it even clearer.
    – Ev. Kounis
    Jan 17 at 16:25

















up vote 2 down vote













I didn't analyze the whole program carefully, so this review is not complete from my point of view.



Use the Right Library Calls



If you read/write files, be careful that you do it in the right way. You can open a file in text mode or in binary mode. If you compare arbitrary files then binary mode is probably the correct one; check the Python Tutorial. If text mode is the wrong one, which test case could demonstrate this? Maybe a file containing \n\r\n and a file containing \r\n\n both contain three bytes, are different in binary mode and maybe the same in text mode, because they both contain two empty lines, but I did not check this (\n is the newline character and \r is the carriage return character).
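
A minimal, self-contained check of that pitfall; the temporary files below are made up for the demonstration, and it assumes the default universal-newline handling of text mode:

import hashlib
import os
import tempfile

def digest(path, mode):
    with open(path, mode) as fh:
        data = fh.read()
        if isinstance(data, str):            # text mode returns str
            data = data.encode('utf-8')
        return hashlib.md5(data).hexdigest()

with tempfile.TemporaryDirectory() as d:
    a, b = os.path.join(d, 'a'), os.path.join(d, 'b')
    with open(a, 'wb') as fh:
        fh.write(b'\n\r\n')
    with open(b, 'wb') as fh:
        fh.write(b'\r\n\n')
    print(digest(a, 'rb') == digest(b, 'rb'))  # False: the bytes differ
    print(digest(a, 'r') == digest(b, 'r'))    # True: both read back as '\n\n'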



Improve your Algorithm



Your program could do the search for duplicate files more efficiently. A file can only be the duplicate of another file if both files have the same length. The length of a file can easily be retrieved from the operating system. So first get the name and size of every file you want to investigate and group them by their sizes. Only if a group contains more than one file do you have to investigate that group further. This can help to avoid reading a lot of large files.



If you have two large files of equal size and large parts of them are not equal, then if you compare the files at some random positions there is a high probability that they differ at those positions. If they differ, you have avoided reading the whole files. The seek method changes the stream position. Only if the files are identical at these random positions do you continue with comparing the hashes of the whole files.
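
A rough sketch of that random-position probe; the function name, the number of probes and the probe size are arbitrary choices of mine, not from the answer:

import os
import random

def probably_equal(path_a, path_b, probes=8, probe_size=64):
    """Cheap pre-check: compare a few random slices of two same-sized files."""
    size = os.path.getsize(path_a)
    if size != os.path.getsize(path_b):
        return False
    with open(path_a, 'rb') as fa, open(path_b, 'rb') as fb:
        for _ in range(probes):
            offset = random.randrange(max(size - probe_size, 0) + 1)
            fa.seek(offset)          # seek() changes the stream position
            fb.seek(offset)
            if fa.read(probe_size) != fb.read(probe_size):
                return False
    return True  # only "probably" equal - confirm with a full hash afterwards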



Don't Use Magic Constants



You use the magic string constant 'cop' somewhere in your code. You shouldn't use such magic constants. If you need them, then define a global variable, choose an appropriate name and maybe write an explanation in a comment. Or, if it is not really a constant, you should supply it as an argument of the function. In your case I think it should either be supplied as an argument, or you should define some kind of test mode for your program (e.g. by setting a global variable TEST_MODE=True) in which this global variable for 'cop' is used.



There is another magic constant in your code: 4098. Again you should define a global variable with a descriptive name. We have:




4098 Byte = 4 KByte + 2 Byte


Are you sure that 4098 is the value you want to use?



In your special case it seems that you have a special treatment for path names that contain /copied/ but you didn't program that. I think you should (almost) always try to program your code so that it actually does what you want. This makes it simpler to understand what you do.
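
A small sketch of how those magic values could become named module-level constants; the names, the 4096 value and the TEST_MODE idea are only illustrative:

import hashlib

CHUNK_SIZE = 4096      # bytes read per hashing step; 4098 was probably meant to be this
KEEP_MARKER = 'cop'    # substring marking the copies that may be deleted
TEST_MODE = False      # could gate the KEEP_MARKER special-casing

def calculate_hash(file, chunk_size=CHUNK_SIZE):
    md5 = hashlib.md5()
    with open(file, 'rb') as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b''):
            md5.update(chunk)
    return md5.hexdigest()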



Different Names for Different Variables



I also don't like a statement where the same variable name has two different meanings like in



file = [file for file in group if ('cop') not in file][0]


file on the right of the = expression is a different variable than on the left of =. Better write



file = [f for f in group if ('cop') not in f][0]


or something similar, this makes it easier to understand.



Don't Use deepcopy Automatically



You use copy.deepcopy. This may be deeper than necessary. You actually only modify the sets, so it would be sufficient to change lines like



group.remove(file)
to_delete.extend(group)


and



to_keep.append(group.pop(file))
to_delete.extend(group)


to



g = copy.copy(group)
g.remove(file)
to_delete.extend(g)


and



g = copy.copy(group)
to_keep.append(g.pop(file))
to_delete.extend(g)


or you replace



duplicates = copy.deepcopy(duplicates)


by



duplicates = [set(group) for group in duplicates]


It depends on the types of the objects whether deepcopy actually makes a copy of the objects, too. For your algorithm a copy of the objects is not necessary. In the case of strings it seems to depend on the Python implementation whether copies of the strings are made (see this stackoverflow answer). The Python implementation on my notebook does not make copies of the strings; the strings of the deepcopy point to the same object:



>>> x = 'abc'
>>> y = copy.deepcopy(x)
>>> print(id(x), id(y))
50516992 50516992


But if you use your functions for lists of sets of mutable objects, then copies of these objects will be made; this is not necessary and may slow down a program. But I admit that in your case, where you read a lot of files from disk, this will not have an influence on the overall performance of the program.



Don't Relate the Description and Names of a Function and its Variables to Code that isn't Related



Ev. Kounis already showed you some serious problems of the sort_duplicates function.
I will concentrate on non-technical aspects. I don't like the naming and description of sort_duplicates. You are talking about files and duplicates, but this function is not about files and duplicates; it is about a list of sets of strings, and if you didn't have this 'cop' thing it would be about a list of sets of arbitrary objects, not even about strings. You want to choose one representative of each set and return a list of these chosen representatives together with a list of the remaining group elements. There is nothing about duplicates in that. So I would change your naming and description. This makes it easier for a reader to understand the function, and it also makes it easier to reuse the function in another context or in a library.
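
As a sketch of what such a renaming could look like (the name choose_representatives and the prefer parameter are my own suggestions, not from the original code):

def choose_representatives(groups, prefer):
    """For each set in groups pick one element, preferring elements for
    which prefer(element) is True, and collect the rest separately."""
    chosen, rest = [], []
    for group in groups:
        group = set(group)  # work on a copy so the caller's sets stay untouched
        pick = next((item for item in group if prefer(item)), None)
        if pick is None:
            pick = group.pop()      # no preferred element: take an arbitrary one
        else:
            group.remove(pick)
        chosen.append(pick)
        rest.extend(group)
    return chosen, rest

# The original behaviour would then be roughly:
# to_keep, to_delete = choose_representatives(duplicates, lambda f: 'cop' not in f)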






edited Mar 2 at 16:24, answered Mar 2 at 9:49 – miracle173











  • edited the post: some notes about the constant 4098 used in the code
    – miracle173
    Mar 2 at 11:25















