Finding and deleting duplicate files in a folder
The following code aims to identify and delete duplicate files in a folder. Files are first compared by size, and then (if needed) by hash. The helper function sort_duplicates determines which file of each group of duplicates to keep. I have skipped the part of the code dealing with the CLI.
As a self-learner of computer science, I would very much appreciate any feedback.
The code is written in Python 3.6.4.
import os
import hashlib
import collections
import copy
import test_duplicates
def calculate_hash(file):
"""
file: path to a file object
Returns a hash of a provided file object
"""
md5 = hashlib.md5()
with open(file, 'rb') as fh:
for chunk in iter(lambda: fh.read(4098), b''):
md5.update(chunk)
return md5.hexdigest()
def sort_duplicates(duplicates):
"""
duplicates: sequence of groups of duplicates
returns two list of the duplicates divided by the rule described in
'find_duplicates'
"""
duplicates = copy.deepcopy(duplicates)
    to_keep = []
    to_delete = []
for group in duplicates:
file = [file for file in group if ('cop') not in file][0]
if file:
to_keep.append(file)
group.remove(file)
to_delete.extend(group)
else:
to_keep.append(group.pop(file))
to_delete.extend(group)
return to_keep, to_delete
def find_duplicates(directory, recursive=True,
exclude_hidden=False, delete=True):
"""returns
list of duplicates grouped in sets by common hash
list of duplicates proposed to delete
    list of duplicates proposed to keep
to_keep contains an arbitrary duplicate from each group of duplicates if
path of the duplicate does not contain string 'cop'
if such duplicate does not exist to_keep collects any arbitrary
duplicate from the group
to_delete contains the rest of the duplicates
"""
duplicates = collections.defaultdict(set)
sizes_dict = collections.defaultdict(set)
# filter by size
for root, dirnames, filenames in os.walk(directory):
if exclude_hidden:
filenames = (filename for filename in filenames
if not filename.startswith('.'))
for filename in filenames:
path = os.path.join(root, filename)
sizes_dict[os.path.getsize(path)].add(path)
if not recursive:
break
groups = [values for values in sizes_dict.values()
if len(values) > 1]
for group in groups:
for item in group:
duplicates[calculate_hash(item)].add(item)
duplicates = [values for values in duplicates.values()
if len(values) > 1]
if not duplicates:
print('No duplicates has been found')
return None
print('----Found following groups of duplicates: ----------------------')
for group in duplicates:
        print(' - '.join(str(duplicate) for duplicate in group), end='\n')
print()
if not delete:
return
to_keep, to_delete = sort_duplicates(duplicates)
print('----Following files from the list of duplicates will be kept:')
    print(*to_keep, sep='\n', end='\n')
print()
print('----Following files from the list of duplicates will be deleted:')
    print(*to_delete, sep='\n')
for path in to_delete:
os.remove(path)
print('Done')
return duplicates, to_keep, to_delete
if __name__ == '__main__':
test_duplicates.make_files('/tmp/d')
find_duplicates('/tmp/d')
Output:
----Found following groups of duplicates: ----------------------
/tmp/d/2 - /tmp/d/copied/2
/tmp/d/copied/4 - /tmp/d/4
/tmp/d/3 - /tmp/d/copied/3
/tmp/d/copied/0 - /tmp/d/0
/tmp/d/copied/1 - /tmp/d/1
----Following files from the list of duplicates will be kept:
/tmp/d/2
/tmp/d/4
/tmp/d/3
/tmp/d/0
/tmp/d/1
----Following files from the list of duplicates will be deleted:
/tmp/d/copied/2
/tmp/d/copied/4
/tmp/d/copied/3
/tmp/d/copied/0
/tmp/d/copied/1
Done
Here is the test_duplicates module:
import py.path
import random
from duplicates import find_duplicates
def make_files(directory):
"""
Creates dummy objects in a given directory:
10 unique file objects in the parent directory
copies of the first 5 objects in the subfolder 'copied'
changed copies (added empty string) of second 5 objects in
the subfolder copied
    returns: mapping of duplicate objects
             list of objects to keep
             list of objects to delete
"""
dir_path = py.path.local(directory)
copied_path = dir_path.join('copied')
copied_path.ensure(dir=True)
changed_path = dir_path.join('changed')
changed_path.ensure(dir=True)
    duplicates, paths, to_delete = [], [], []
for ind in range(10):
path = dir_path.join(str(ind))
path.write(random.randrange(ind * 10 - 10, ind * 10), ensure=True)
paths.append(path)
assert path.check()
# copy first 5 files into the subfolder 'copy'
for path in paths[:5]:
path_copy = copied_path.join(path.basename)
path.copy(path_copy)
paths.append(path_copy)
to_delete.append(path_copy)
        duplicates.append({str(path), str(path_copy)})
assert path_copy.check()
# copy second 5 files and add empty string to each of them:
for path in paths[5:10]:
path_changed = changed_path.join(path.basename)
path.copy(path_changed)
        path_changed.write('\n', mode='a')
paths.append(path_changed)
assert path_changed.check()
to_keep = [str(path) for path in paths[:5]]
paths.extend((copied_path, changed_path))
files = sorted(list(dir_path.visit()))
assert files == sorted(paths)
return duplicates, to_keep, to_delete
def test_duplicates(tmpdir):
"""Compares received and expected results of find_duplicate function"""
exp_duplicates, exp_to_keep, exp_to_delete = make_files(tmpdir)
rec_duplicates, rec_to_keep, rec_to_delete = find_duplicates(
tmpdir, recursive=True, delete=True)
assert len(exp_duplicates) == len(rec_duplicates)
for group in rec_duplicates:
assert group in exp_duplicates
assert sorted(exp_to_keep) == sorted(rec_to_keep)
assert sorted(exp_to_delete) == sorted(rec_to_delete)
no_duplicates = tmpdir.join('copied')
assert find_duplicates(no_duplicates) == None
Tags: python, file-system
You said you left out the CLI, therefore you should explain how to call your code. Having a test is great for this, so everything should be fine by now.
– Roland Illig
Jan 16 at 19:18
If the files are very large, you might speed it up by checking for the file sizes and the content of the first 100 bytes.
– Maarten Fabré
Mar 2 at 14:23
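A minimal sketch of the pre-filter suggested in the comment above: the 100-byte probe comes from the comment, while the helper name and its exact shape are my own assumption. It is meant to run on files that already share a size, so that only groups with matching prefixes get fully hashed.
import collections

def group_by_first_bytes(paths, probe_size=100):
    """Group same-sized candidate files by their first bytes."""
    groups = collections.defaultdict(set)
    for path in paths:
        with open(path, 'rb') as fh:
            groups[fh.read(probe_size)].add(path)
    # only groups that still contain more than one file need hashing
    return [group for group in groups.values() if len(group) > 1]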
3 Answers
Accepted answer (score 6), by Peilonrayz, answered Jan 17 at 18:41
- I think you should make a chunk_file function, so your code is more DRY. This would mean splitting calculate_hash into two functions, calculate_hash and chunk_file.
- I think you should be able to pass what type of hash you want calculate_hash to return. This is as you can have hash collisions.
- find_duplicates should be split into more functions:
  - You should make a files_under function that finds all the files under a specific directory.
  - You should make a filter_hidden function, so you can change the input of find_duplicates, rather than relying on find_duplicates to do everything.
  - find_duplicates shouldn't have the printing code in it, as printing filtered duplicates isn't about finding duplicates.
- Rather than using "if not recursive: break" I'd filter the input to the loop.
- I wouldn't build a groups intermediary list, as you're doubling the amount of memory you're using for no viable gain.
- I'd merge your duplicates and sizes_dict code into one loop, so they are the same variable. This is as you're doing the same thing with both of them, and carrying on the way you are now is hard to extend.
- I would use pathlib so that you have an easier-to-use object than a string.
I changed your code, however I didn't implement all of it. I find sort_duplicates to be quite confusing, and so left that as it is.
Below are my changes:
import collections
import copy
import functools
import hashlib
import os
import pathlib
def chunk_file(file_path, chunk_size=None):
with open(file_path) as file:
if chunk_size is None:
chunk_size = file._CHUNK_SIZE
while True:
data = file.read(chunk_size)
if not data:
break
yield data
def calculate_hash(file_path, hash):
hash_object = hash()
for chunk in chunk_file(file_path):
hash_object.update(chunk)
return hash_object.hexdigest()
def files_under(directory, recursive=True):
    walk = os.walk(directory)
    if not recursive:
        walk = [next(walk)]
    for root, _, file_names in walk:
root = pathlib.Path(root)
for file_name in file_names:
yield root / file_name
def filter_hidden(paths):
for path in paths:
if not path.name.startswith('.'):
yield path
def sort_duplicates(duplicates):
"""
duplicates: sequence of groups of duplicates
returns two list of the duplicates divided by the rule described in
'find_duplicates'
"""
duplicates = copy.deepcopy(duplicates)
    to_keep = []
    to_delete = []
for group in duplicates:
file = [file for file in group if ('cop') not in file][0]
if file:
to_keep.append(file)
group.remove(file)
to_delete.extend(group)
else:
to_keep.append(group.pop(file))
to_delete.extend(group)
return to_keep, to_delete
def find_duplicates(directory, recursive=True,
exclude_hidden=False, delete=True):
methods = [
lambda path: path.stat().st_size,
functools.partial(calculate_hash, hash=hashlib.md5),
functools.partial(calculate_hash, hash=hashlib.sha256)
]
    # gather the candidate paths and start with a single group containing all of them
    paths = files_under(directory, recursive)
    if exclude_hidden:
        paths = filter_hidden(paths)
    prev = {None: set(paths)}
for method in methods:
new = collections.defaultdict(set)
for group in prev.values():
if len(group) <= 1:
continue
for item in group:
new[method(item)].add(item)
prev = new
return prev
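For what it's worth, here is roughly how the rewritten find_duplicates above might be called. Note two assumptions: the chunking sketch opens files in text mode, so for hashing arbitrary files you would probably want open(file_path, 'rb'), and the final filtering of singleton groups is mine, since the returned dict can still contain groups of size one after the last split.
if __name__ == '__main__':
    groups = find_duplicates('/tmp/d')
    # the result maps the last hash used to a set of paths
    for group in (g for g in groups.values() if len(g) > 1):
        print(' - '.join(str(path) for path in group))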
Thank you very much, I have learned a lot from your answer! I was also considering using path objects instead of strings, in particular by using the py.path module, as it implements a path.computehash method. I have tried to compare the performance of the custom compute_hash function and the py.path.computehash method here, but I assume I made some mistake as the timing results are unreasonably large. If you have time, please take a look.
– Ayenn Ul
Jan 18 at 8:47
The way you limited the non-recursive search by using the first outcome of the next function in the files_under function is very neat. In this function you yield root / file_name, which, as far as I understand, gives you the full path of the file. Could I use this method instead of os.path.join everywhere, or are there some limitations? I also searched for the _CHUNK_SIZE attribute in the documentation but found only the chunk module. From my understanding, it deals only with certain kinds of media files. So, _CHUNK_SIZE must be based on something else?
– Ayenn Ul
Jan 18 at 10:35
@OzRamsay Thank you :) Yes, root / file_name is a high-level alternate to os.path.join. As far as I know there are no limitations. I used dir(file) and noticed a _CHUNK_SIZE variable, and so thought using Python's default chunk size to be a reasonable default. I don't think it's documented, as it is also a private field.
– Peilonrayz
Jan 18 at 10:41
Making the search recursive or not with pathlib.Path is even simpler.
– Maarten Fabré
Mar 2 at 14:19
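A minimal sketch of that pathlib approach, assuming we only want regular files (the helper name iter_files is mine):
import pathlib

def iter_files(directory, recursive=True):
    """Yield the files under directory, optionally recursing into subfolders."""
    root = pathlib.Path(directory)
    # rglob('*') walks the whole tree, glob('*') only the top level
    candidates = root.rglob('*') if recursive else root.glob('*')
    for path in candidates:
        if path.is_file():
            yield path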
Answer (score 5), by Ev. Kounis
Your code is generally speaking nice-looking if I may say so. One thing that caught my eye is your sort_duplicates
function. This one seems a bit convoluted and messy.
More specifically:
- The list comprehension will throw an error if the list comes back empty ([0] -> IndexError). As a result, checking for "if file" does not make sense; it would have already failed.
- An entire list comprehension is not needed if you just want the first item. Use next instead; it would be much faster and can provide you with a default value in case nothing is found.
- The if and else blocks contain the same code, so you do not need them. There is no reason to check for something if your decisions will not be influenced by the check.
- In the now meaningless else there is a group.pop(file) when file does not exist. What's up with that?
- The recommended indentation is only 4 spaces. You are using 6, thus making the nested blocks look too isolated.
So I would re-write as:
def sort_duplicates(duplicates):
"""
duplicates: sequence of groups of duplicates
returns two list of the duplicates divided by the rule described in
'find_duplicates'
"""
duplicates = copy.deepcopy(duplicates)
    to_keep = []
    to_delete = []
for group in duplicates:
file = next((file for file in group if 'cop' not in file), None)
if file is not None:
to_keep.append(group.pop(file))
to_delete.extend(group)
return to_keep, to_delete
I will be updating my answer if I see anything else worth pointing out.
Based on your comments below, we can adjust the code further to include a fall-back scenario:
def sort_duplicates(duplicates):
"""
duplicates: sequence of groups of duplicates
returns two list of the duplicates divided by the rule described in
'find_duplicates'
"""
duplicates = copy.deepcopy(duplicates)
    to_keep = []
    to_delete = []
file, prev_file = '', ''
for group in duplicates:
file = next((file for file in group if 'cop' not in file), prev_file)
if file != prev_file:
to_keep.append(group.pop(file))
else:
to_keep.append(group.pop())
to_delete.extend(group)
prev_file = file
return to_keep, to_delete
Note that I cannot properly test the above and as such there might be small bugs that you would have to amend. However, the important thing for me is that you understand the logic.
Thank you very much for the valuable results. file = [file for file in group if ('cop') not in file][0] if file: to_keep.append(file) group.remove(file) to_delete.extend(group) else: to_keep.append(group.pop(file)) to_delete.extend(group)
– Ayenn Ul
Jan 17 at 16:02
@OzRamsay You are welcome. Is there a reason you posted that code in the comment?
– Ev. Kounis
Jan 17 at 16:05
Yes, in regards to your remark 3: I wanted to keep only one file of the group of duplicates. To choose such a file I add filenames which do not contain the string 'cop' (to exclude filenames such as 'copy1' or 'copied') to the list 'files'. If the list is not empty I choose an arbitrary name from the list (it happens to be the first one). If there are no such names (i.e. all names contain the string 'cop') I can choose any name from the entire group. In this case, it will be the last processed filename, which is stored as the variable 'file' from the previous list comprehension. What do you think about that?
– Ayenn Ul
Jan 17 at 16:12
@OzRamsay Well, I cannot comment on the idea because that is just what you want to do. The how is where I come in, and I can tell you that the code you had did not do that. If the list comes back empty your code raises an error... There is no falling back to the previous file value.
– Ev. Kounis
Jan 17 at 16:19
@OzRamsay Take a look! This might make it even clearer.
– Ev. Kounis
Jan 17 at 16:25
Answer (score 2), by miracle173
I didn't analyze the whole program carefully, so this review is not complete from my point of view.
Use the Right Library Calls
If you read or write files, be careful that you do it in the right way. You can open a file in text mode or in binary mode. If you compare arbitrary files then binary mode may be the correct one; check the Python Tutorial. If text mode is the wrong one, which test case could demonstrate this? Maybe a file containing '\n\r\n' and a file containing '\r\n\n': both contain three bytes, are different in binary mode and maybe the same in text mode, because they both contain two empty lines, but I did not check this ('\n' is the newline character and '\r' is the carriage return character).
Improve your Algorithm
Your program could do the search for duplicate files more efficiently. A file can only be a duplicate of another file if both files have the same length. The length of a file can easily be retrieved from the operating system. So first get the name and size of every file you want to investigate and group the files by their sizes. Only if a group contains more than one file do you have to investigate that group further. This helps to avoid reading a lot of large files.
If you have two large files of equal size and large parts of them are not equal, then if you compare the files at some random positions there is a high probability that they differ at those positions. If they differ, you have avoided reading the whole files. The seek method changes the stream position. Only if the files are identical at these random positions do you continue with comparing the hashes of the whole files.
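A minimal sketch of that spot-check idea, assuming the two files are already known to have the same size (the helper name and the number of probes are my own choices):
import random

def probably_equal(path_a, path_b, size, probes=8, probe_len=64):
    """Compare two same-sized files at a few random offsets before hashing them."""
    if size == 0:
        return True
    with open(path_a, 'rb') as fa, open(path_b, 'rb') as fb:
        for _ in range(probes):
            offset = random.randrange(size)
            fa.seek(offset)
            fb.seek(offset)
            # read a small block at the same position in both files
            if fa.read(probe_len) != fb.read(probe_len):
                return False    # definitely not duplicates
    return True                 # still candidates: fall back to full hashing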
Don't Use Magic Constants
You use the magic string constant 'cop' somewhere in your code. You shouldn't use such magic constants. If you need them, then define a global variable, choose an appropriate name and maybe write an explanation in a comment. Or, if it is not really a constant, then you should supply it as an argument of the function. In your case I think it should either be supplied as an argument or you define some kind of test mode for your program (e.g. by setting a global variable TEST_MODE=True) where this global variable for 'cop' is used.
There is another magic constant in your code: 4098. Again you should define a global variable with a descriptive name. We have:
4098 Byte = 4 KByte + 2 Byte
Are you sure that 4098 is the value you want to use?
In your special case it seems that you want a special treatment for path names that contain /copied/, but you didn't program that. I think you should (almost) always try to program your code so that it actually does what you want. This makes it simpler to understand what you do.
Different Names for Different Variables
I also don't like a statement where the same variable name has two different meanings, like in
file = [file for file in group if ('cop') not in file][0]
The file on the right of the = expression is a different variable than the one on the left of =. Better write
file = [f for f in group if ('cop') not in f][0]
or something similar; this makes it easier to understand.
Don't Use deepcopy Automatically
You use copy.deepcopy. This may be deeper than necessary. You actually only modify the sets, so it would be sufficient to change lines like
group.remove(file)
to_delete.extend(group)
and
to_keep.append(group.pop(file))
to_delete.extend(group)
to
g = copy.copy(group)
g.remove(file)
to_delete.extend(g)
and
g = copy.copy(group)
to_keep.append(g.pop(file))
to_delete.extend(g)
or you replace
duplicates = copy.deepcopy(duplicates)
by
duplicates = [set(group) for group in duplicates]
Whether deepcopy actually makes a copy of the contained objects, too, depends on their types. For your algorithm a copy of the objects is not necessary. In the case of strings it seems to depend on the Python implementation whether copies of the strings are made (see this stackoverflow answer). The Python implementation on my notebook does not make copies of the strings; the strings in the deepcopy point to the same objects:
>>> x = 'abc'
>>> y = copy.deepcopy(x)
>>> print(id(x), id(y))
50516992 50516992
But if you use your functions for lists of sets of mutable objects, then copies of these objects will be made, and this is not necessary and may slow down a program. But I admit that in your case, where you read a lot of files from the disk, this will not have an influence on the overall performance of the program.
Don't Relate the Description and Names of a Function and its Variables to Code that isn't Related
Ev. Kounis already showed you some serious problems of the sort_duplicates function.
I will concentrate on non-technical aspects. I don't like the naming and description of sort_duplicates. You are talking about files and duplicates, but this function is not about files and duplicates; it is about lists of sets of strings, and if you didn't have this 'cop' thing it would be about lists of sets of arbitrary objects, not even about strings. You want to choose one representative of each set and return a list of these chosen representatives and a list of the remaining group elements. There is nothing about duplicates. So I would change the naming and description. This makes it easier for a reader to understand the function, and also easier to reuse the function in another context or in a library.
edited the post: some notes about the constant 4098 used in the code
– miracle173
Mar 2 at 11:25
3 Answers
3
active
oldest
votes
3 Answers
3
active
oldest
votes
active
oldest
votes
active
oldest
votes
up vote
6
down vote
accepted
I think you should make a
chunk_file
function, so your code is more DRY.This would mean splitting
calculate_hash
andchunk_file
into two.I think you should be able to pass what type of hash you want
calculate_hash
to return. This is as you can have hash-collisions.find_duplicates
should be split into more functions.- You should make a
files_under
function that finds all the files under a specific directory. - You should make a
filter_hidden
function, so you can change the input offind_duplicates
, rather than relying onfind_duplicates
to do everything. find_duplicates
shouldn't have the printing code in it. This is as printing filtered duplicates isn't about finding duplicates.
- You should make a
Rather than using
if not recursive: break
I'd filter the input to the loop.- I wouldn't build a
groups
intermarry list. As you're doubling the amount of memory you're using for no viable gain. - I'd merge your
duplicates
andsizes_dict
code into a loop. And so they are the same variable. This is as you're doing the same thing over both the things. And carrying on the way you are no is hard to extend. - I would use
pathlib
so that you have an easier to use object, than a string.
I changed your code, however didn't implement all of it. I find sort_duplicates
to be quite confusing, and so left that as it is.
Below are my changes:
import functools
import os
import pathlib
def chunk_file(file_path, chunk_size=None):
with open(file_path) as file:
if chunk_size is None:
chunk_size = file._CHUNK_SIZE
while True:
data = file.read(chunk_size)
if not data:
break
yield data
def calculate_hash(file_path, hash):
hash_object = hash()
for chunk in chunk_file(file_path):
hash_object.update(chunk)
return hash_object.hexdigest()
def files_under(directory, recursive=True):
walk = os.walk(directory):
if not recursive:
walk = [next(walk)]
for root, _, file_names in walk
root = pathlib.Path(root)
for file_name in file_names:
yield root / file_name
def filter_hidden(paths):
for path in paths:
if not path.name.startswith('.'):
yield path
def sort_duplicates(duplicates):
"""
duplicates: sequence of groups of duplicates
returns two list of the duplicates divided by the rule described in
'find_duplicates'
"""
duplicates = copy.deepcopy(duplicates)
to_keep =
to_delete =
for group in duplicates:
file = [file for file in group if ('cop') not in file][0]
if file:
to_keep.append(file)
group.remove(file)
to_delete.extend(group)
else:
to_keep.append(group.pop(file))
to_delete.extend(group)
return to_keep, to_delete
def find_duplicates(directory, recursive=True,
exclude_hidden=False, delete=True):
methods = [
lambda path: path.stat().st_size,
functools.partial(calculate_hash, hash=hashlib.md5),
functools.partial(calculate_hash, hash=hashlib.sha256)
]
prev = None: set(paths)
for method in methods:
new = collections.defaultdict(set)
for group in prev.values():
if len(group) <= 1:
continue
for item in group:
new[method(item)].add(item)
prev = new
return prev
Thank you very much, I have learned a lot from your answer! I was also considering using a path objects instead of a string, in particular by usingpy.path
module, as it implementspath.computehash
method. I have tried to compare performances of the customcompute_hash
function andpy.path.computehash
method here, but I assume I made some mistake as the timing results are unreasonably large. If you have time, please, take a look.
â Ayenn Ul
Jan 18 at 8:47
The way you limited the non-recursive search by using the first outcome of thenext
function infiles_under
function is very neat. In this function, you yield theroot / file_name
, as far as I understand, in this way you obtain the full path of the file. Could I use this method instead ofos.path.join
everywhere, or are there some limitations? I also searched for_CHUNK_SIZE
attribute in the documentation but found only chunk module. From my understanding, it deals only with certain kinds of media files. So,_CHUNK_SIZE
must be based on something else?
â Ayenn Ul
Jan 18 at 10:35
@OzRamsay Thank you, :) Yesroot / file_name
is a high-level alturnate toos.path.join
. As far as I know there are no limitations. I useddir(file)
, and noticed a_CHUNK_SIZE
variable, and so thought using Python's default chunk size to be a reasonable default. I don't think it's documented, as it is also a private field.
â Peilonrayz
Jan 18 at 10:41
making the search recursive or not withpathlib.Path
is even simpler.
â Maarten Fabré
Mar 2 at 14:19
add a comment |Â
up vote
6
down vote
accepted
I think you should make a
chunk_file
function, so your code is more DRY.This would mean splitting
calculate_hash
andchunk_file
into two.I think you should be able to pass what type of hash you want
calculate_hash
to return. This is as you can have hash-collisions.find_duplicates
should be split into more functions.- You should make a
files_under
function that finds all the files under a specific directory. - You should make a
filter_hidden
function, so you can change the input offind_duplicates
, rather than relying onfind_duplicates
to do everything. find_duplicates
shouldn't have the printing code in it. This is as printing filtered duplicates isn't about finding duplicates.
- You should make a
Rather than using
if not recursive: break
I'd filter the input to the loop.- I wouldn't build a
groups
intermarry list. As you're doubling the amount of memory you're using for no viable gain. - I'd merge your
duplicates
andsizes_dict
code into a loop. And so they are the same variable. This is as you're doing the same thing over both the things. And carrying on the way you are no is hard to extend. - I would use
pathlib
so that you have an easier to use object, than a string.
I changed your code, however didn't implement all of it. I find sort_duplicates
to be quite confusing, and so left that as it is.
Below are my changes:
import functools
import os
import pathlib
def chunk_file(file_path, chunk_size=None):
with open(file_path) as file:
if chunk_size is None:
chunk_size = file._CHUNK_SIZE
while True:
data = file.read(chunk_size)
if not data:
break
yield data
def calculate_hash(file_path, hash):
hash_object = hash()
for chunk in chunk_file(file_path):
hash_object.update(chunk)
return hash_object.hexdigest()
def files_under(directory, recursive=True):
walk = os.walk(directory):
if not recursive:
walk = [next(walk)]
for root, _, file_names in walk
root = pathlib.Path(root)
for file_name in file_names:
yield root / file_name
def filter_hidden(paths):
for path in paths:
if not path.name.startswith('.'):
yield path
def sort_duplicates(duplicates):
"""
duplicates: sequence of groups of duplicates
returns two list of the duplicates divided by the rule described in
'find_duplicates'
"""
duplicates = copy.deepcopy(duplicates)
to_keep =
to_delete =
for group in duplicates:
file = [file for file in group if ('cop') not in file][0]
if file:
to_keep.append(file)
group.remove(file)
to_delete.extend(group)
else:
to_keep.append(group.pop(file))
to_delete.extend(group)
return to_keep, to_delete
def find_duplicates(directory, recursive=True,
exclude_hidden=False, delete=True):
methods = [
lambda path: path.stat().st_size,
functools.partial(calculate_hash, hash=hashlib.md5),
functools.partial(calculate_hash, hash=hashlib.sha256)
]
prev = None: set(paths)
for method in methods:
new = collections.defaultdict(set)
for group in prev.values():
if len(group) <= 1:
continue
for item in group:
new[method(item)].add(item)
prev = new
return prev
Thank you very much, I have learned a lot from your answer! I was also considering using a path objects instead of a string, in particular by usingpy.path
module, as it implementspath.computehash
method. I have tried to compare performances of the customcompute_hash
function andpy.path.computehash
method here, but I assume I made some mistake as the timing results are unreasonably large. If you have time, please, take a look.
â Ayenn Ul
Jan 18 at 8:47
The way you limited the non-recursive search by using the first outcome of thenext
function infiles_under
function is very neat. In this function, you yield theroot / file_name
, as far as I understand, in this way you obtain the full path of the file. Could I use this method instead ofos.path.join
everywhere, or are there some limitations? I also searched for_CHUNK_SIZE
attribute in the documentation but found only chunk module. From my understanding, it deals only with certain kinds of media files. So,_CHUNK_SIZE
must be based on something else?
â Ayenn Ul
Jan 18 at 10:35
@OzRamsay Thank you, :) Yesroot / file_name
is a high-level alturnate toos.path.join
. As far as I know there are no limitations. I useddir(file)
, and noticed a_CHUNK_SIZE
variable, and so thought using Python's default chunk size to be a reasonable default. I don't think it's documented, as it is also a private field.
â Peilonrayz
Jan 18 at 10:41
making the search recursive or not withpathlib.Path
is even simpler.
â Maarten Fabré
Mar 2 at 14:19
add a comment |Â
up vote
6
down vote
accepted
up vote
6
down vote
accepted
I think you should make a
chunk_file
function, so your code is more DRY.This would mean splitting
calculate_hash
andchunk_file
into two.I think you should be able to pass what type of hash you want
calculate_hash
to return. This is as you can have hash-collisions.find_duplicates
should be split into more functions.- You should make a
files_under
function that finds all the files under a specific directory. - You should make a
filter_hidden
function, so you can change the input offind_duplicates
, rather than relying onfind_duplicates
to do everything. find_duplicates
shouldn't have the printing code in it. This is as printing filtered duplicates isn't about finding duplicates.
- You should make a
Rather than using
if not recursive: break
I'd filter the input to the loop.- I wouldn't build a
groups
intermarry list. As you're doubling the amount of memory you're using for no viable gain. - I'd merge your
duplicates
andsizes_dict
code into a loop. And so they are the same variable. This is as you're doing the same thing over both the things. And carrying on the way you are no is hard to extend. - I would use
pathlib
so that you have an easier to use object, than a string.
I changed your code, however didn't implement all of it. I find sort_duplicates
to be quite confusing, and so left that as it is.
Below are my changes:
import functools
import os
import pathlib
def chunk_file(file_path, chunk_size=None):
with open(file_path) as file:
if chunk_size is None:
chunk_size = file._CHUNK_SIZE
while True:
data = file.read(chunk_size)
if not data:
break
yield data
def calculate_hash(file_path, hash):
hash_object = hash()
for chunk in chunk_file(file_path):
hash_object.update(chunk)
return hash_object.hexdigest()
def files_under(directory, recursive=True):
walk = os.walk(directory):
if not recursive:
walk = [next(walk)]
for root, _, file_names in walk
root = pathlib.Path(root)
for file_name in file_names:
yield root / file_name
def filter_hidden(paths):
for path in paths:
if not path.name.startswith('.'):
yield path
def sort_duplicates(duplicates):
"""
duplicates: sequence of groups of duplicates
returns two list of the duplicates divided by the rule described in
'find_duplicates'
"""
duplicates = copy.deepcopy(duplicates)
to_keep =
to_delete =
for group in duplicates:
file = [file for file in group if ('cop') not in file][0]
if file:
to_keep.append(file)
group.remove(file)
to_delete.extend(group)
else:
to_keep.append(group.pop(file))
to_delete.extend(group)
return to_keep, to_delete
def find_duplicates(directory, recursive=True,
exclude_hidden=False, delete=True):
methods = [
lambda path: path.stat().st_size,
functools.partial(calculate_hash, hash=hashlib.md5),
functools.partial(calculate_hash, hash=hashlib.sha256)
]
prev = None: set(paths)
for method in methods:
new = collections.defaultdict(set)
for group in prev.values():
if len(group) <= 1:
continue
for item in group:
new[method(item)].add(item)
prev = new
return prev
I think you should make a
chunk_file
function, so your code is more DRY.This would mean splitting
calculate_hash
andchunk_file
into two.I think you should be able to pass what type of hash you want
calculate_hash
to return. This is as you can have hash-collisions.find_duplicates
should be split into more functions.- You should make a
files_under
function that finds all the files under a specific directory. - You should make a
filter_hidden
function, so you can change the input offind_duplicates
, rather than relying onfind_duplicates
to do everything. find_duplicates
shouldn't have the printing code in it. This is as printing filtered duplicates isn't about finding duplicates.
- You should make a
Rather than using
if not recursive: break
I'd filter the input to the loop.- I wouldn't build a
groups
intermarry list. As you're doubling the amount of memory you're using for no viable gain. - I'd merge your
duplicates
andsizes_dict
code into a loop. And so they are the same variable. This is as you're doing the same thing over both the things. And carrying on the way you are no is hard to extend. - I would use
pathlib
so that you have an easier to use object, than a string.
I changed your code, however didn't implement all of it. I find sort_duplicates
to be quite confusing, and so left that as it is.
Below are my changes:
import functools
import os
import pathlib
def chunk_file(file_path, chunk_size=None):
with open(file_path) as file:
if chunk_size is None:
chunk_size = file._CHUNK_SIZE
while True:
data = file.read(chunk_size)
if not data:
break
yield data
def calculate_hash(file_path, hash):
hash_object = hash()
for chunk in chunk_file(file_path):
hash_object.update(chunk)
return hash_object.hexdigest()
def files_under(directory, recursive=True):
walk = os.walk(directory):
if not recursive:
walk = [next(walk)]
for root, _, file_names in walk
root = pathlib.Path(root)
for file_name in file_names:
yield root / file_name
def filter_hidden(paths):
for path in paths:
if not path.name.startswith('.'):
yield path
def sort_duplicates(duplicates):
"""
duplicates: sequence of groups of duplicates
returns two list of the duplicates divided by the rule described in
'find_duplicates'
"""
duplicates = copy.deepcopy(duplicates)
to_keep =
to_delete =
for group in duplicates:
file = [file for file in group if ('cop') not in file][0]
if file:
to_keep.append(file)
group.remove(file)
to_delete.extend(group)
else:
to_keep.append(group.pop(file))
to_delete.extend(group)
return to_keep, to_delete
def find_duplicates(directory, recursive=True,
exclude_hidden=False, delete=True):
methods = [
lambda path: path.stat().st_size,
functools.partial(calculate_hash, hash=hashlib.md5),
functools.partial(calculate_hash, hash=hashlib.sha256)
]
prev = None: set(paths)
for method in methods:
new = collections.defaultdict(set)
for group in prev.values():
if len(group) <= 1:
continue
for item in group:
new[method(item)].add(item)
prev = new
return prev
answered Jan 17 at 18:41
Peilonrayz
24.4k336102
24.4k336102
Thank you very much, I have learned a lot from your answer! I was also considering using a path objects instead of a string, in particular by usingpy.path
module, as it implementspath.computehash
method. I have tried to compare performances of the customcompute_hash
function andpy.path.computehash
method here, but I assume I made some mistake as the timing results are unreasonably large. If you have time, please, take a look.
â Ayenn Ul
Jan 18 at 8:47
The way you limited the non-recursive search by using the first outcome of thenext
function infiles_under
function is very neat. In this function, you yield theroot / file_name
, as far as I understand, in this way you obtain the full path of the file. Could I use this method instead ofos.path.join
everywhere, or are there some limitations? I also searched for_CHUNK_SIZE
attribute in the documentation but found only chunk module. From my understanding, it deals only with certain kinds of media files. So,_CHUNK_SIZE
must be based on something else?
â Ayenn Ul
Jan 18 at 10:35
@OzRamsay Thank you, :) Yesroot / file_name
is a high-level alturnate toos.path.join
. As far as I know there are no limitations. I useddir(file)
, and noticed a_CHUNK_SIZE
variable, and so thought using Python's default chunk size to be a reasonable default. I don't think it's documented, as it is also a private field.
â Peilonrayz
Jan 18 at 10:41
making the search recursive or not withpathlib.Path
is even simpler.
â Maarten Fabré
Mar 2 at 14:19
up vote
5
down vote
Your code is generally speaking nice-looking if I may say so. One thing that caught my eye is your sort_duplicates
function. This one seems a bit convoluted and messy.
More specifically:
- The list comprehension will throw an error if the list comes back empty ([0] -> IndexError). As a result, checking for if file does not make sense; it would have already failed.
- An entire list comprehension is not needed if you just want the first item. Use next instead; it would be much faster and can provide you with a default value in case nothing is found (see the short demonstration after this list).
- The if and else blocks contain the same code, so... you do not need them. There is no reason to check for something if your decisions will not be influenced by the check.
- In the now meaningless else there is a group.pop(file) when file does not exist. What's up with that?
- The recommended indentation is only 4 spaces. You are using 6, which makes the nested blocks look too isolated.
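For reference, a minimal sketch (my addition, not from the original answer) of why next with a default is safer than indexing a possibly empty list:

group = {'a_copy', 'another_copy'}   # every name contains 'cop'

# [f for f in group if 'cop' not in f][0] would raise IndexError here,
# because the filtered list is empty.

# next() with a default falls back instead of raising:
first_original = next((f for f in group if 'cop' not in f), None)
print(first_original)   # None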
So I would re-write as:
def sort_duplicates(duplicates):
    """
    duplicates: sequence of groups of duplicates
    returns two lists of the duplicates divided by the rule described in
    'find_duplicates'
    """
    duplicates = copy.deepcopy(duplicates)
    to_keep = []
    to_delete = []
    for group in duplicates:
        file = next((file for file in group if 'cop' not in file), None)
        if file is not None:
            to_keep.append(group.pop(file))
        to_delete.extend(group)
    return to_keep, to_delete
I will be updating my answer if I see anything else worth pointing out.
Based on your comments below, we can adjust the code to include a fall-back scenario:
def sort_duplicates(duplicates):
    """
    duplicates: sequence of groups of duplicates
    returns two lists of the duplicates divided by the rule described in
    'find_duplicates'
    """
    duplicates = copy.deepcopy(duplicates)
    to_keep = []
    to_delete = []
    file, prev_file = '', ''
    for group in duplicates:
        file = next((file for file in group if 'cop' not in file), prev_file)
        if file != prev_file:
            to_keep.append(group.pop(file))
        else:
            to_keep.append(group.pop())
        to_delete.extend(group)
        prev_file = file
    return to_keep, to_delete
Note that I cannot properly test the above and as such there might be small bugs that you would have to amend. However, the important thing for me is that you understand the logic.
Thank you very much for the valuable results. file = [file for file in group if ('cop') not in file][0] if file: to_keep.append(file) group.remove(file) to_delete.extend(group) else: to_keep.append(group.pop(file)) to_delete.extend(group)
– Ayenn Ul
Jan 17 at 16:02
@OzRamsay You are welcome. Is there a reason you posted that code in the comment?
– Ev. Kounis
Jan 17 at 16:05
Yes, in regards to your remark 3: I wanted to keep only one file of the group of duplicates. To choose such a file I add filenames which do not contain the string 'cop' (to exclude filenames such as 'copy1' or 'copied') to the list 'files'. If the list is not empty I choose an arbitrary name from the list (it happens to be the first one). If there are no such names (i.e. all names contain the string 'cop') I can choose any name from the entire group. In this case, it will be the last processed filename, which is stored as the variable 'file' from the previous list comprehension. What do you think about that?
– Ayenn Ul
Jan 17 at 16:12
@OzRamsay Well, I cannot comment on the idea because that is just what you want to do. The how is where I come in, and I can tell you that the code you had did not do that. If the list comes back empty your code raises an Error... There is no falling back to the previous file value.
– Ev. Kounis
Jan 17 at 16:19
1
@OzRamsay Take a look! This might make it even clearer.
– Ev. Kounis
Jan 17 at 16:25
edited Jan 17 at 16:34
answered Jan 17 at 10:13
Ev. Kounis
620313
up vote
2
down vote
I didn't analyze the whole program carefully, so this review is not complete from my point of view.
Use the Right Library Calls
If you read/write files, be careful that you do it in the right way. You can open a file in text mode or binary mode. If you compare arbitrary files then binary mode may be the correct one; check the Python Tutorial. If text mode is the wrong one, which test case could demonstrate this? Maybe a file containing \n\r\n and a file containing \r\n\n: both contain three bytes, are different in binary mode and maybe the same in text mode, because they both contain two empty lines, but I did not check this (\n is the newline character and \r is the carriage return character).
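A small sketch (my addition) of how such a test case could look:

import pathlib
import tempfile

tmp = pathlib.Path(tempfile.mkdtemp())
a, b = tmp / 'a', tmp / 'b'
a.write_bytes(b'\n\r\n')
b.write_bytes(b'\r\n\n')

print(a.read_bytes() == b.read_bytes())   # False: the raw bytes differ
print(a.read_text() == b.read_text())     # True: universal-newline translation
                                          # turns both into '\n\n'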
Improve your Algorithm
Your program could do the search for duplicate files more efficiently. A file can only be a duplicate of another file if both files have the same length. The length of a file can easily be retrieved from the operating system. So first get the name and size of every file you want to investigate and group them by size. Only if a group contains more than one file do you have to investigate it further. This can help to avoid reading a lot of large files.
If you have two large files of equal size and large parts of them are not equal, then comparing the files at a few random positions gives a high probability that they differ at one of those positions. If they differ, you have avoided reading the whole files. The seek method changes the stream position. Only if the files are identical at these random positions do you continue with comparing the hashes of the whole files.
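One possible sketch of that idea (my addition; the function name and parameters are made up):

import random

def probably_identical(path_a, path_b, size, samples=8, width=64):
    # Compare a few randomly chosen slices before hashing whole files.
    # Returns False as soon as one sampled slice differs.
    with open(path_a, 'rb') as fa, open(path_b, 'rb') as fb:
        for _ in range(samples):
            offset = random.randrange(max(size - width, 1))
            fa.seek(offset)
            fb.seek(offset)
            if fa.read(width) != fb.read(width):
                return False
    return True   # still only "probably" identical; hash to confirm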
Don't Use Magic Constants
You use the magic string constant 'cop' somewhere in your code. You shouldn't use such magic constants. If you need them, then define a global variable, choose an appropriate name and maybe write an explanation in a comment. Or, if it is not really a constant, then you should supply it as an argument of the function. In your case I think it should either be supplied as an argument or you should define some kind of test mode for your program (e.g. by setting a global variable TEST_MODE=True) where this global variable for 'cop' is used.
There is another magic constant in your code: 4098. Again you should define a global variable with a descriptive name. We have:
4098 Byte = 4 KByte + 2 Byte
Are you sure that 4098 is the value you want to use?
In your special case it seems that you intend a special treatment for path names that contain /copied/, but you didn't program that. I think you should (almost) always try to program your code so that it actually does what you want. This makes it simpler to understand what you do.
Different Names for Different Variables
I also don't like a statement where the same variable name has two different meanings, like in
file = [file for file in group if ('cop') not in file][0]
Here file on the right of the = expression is a different variable than the one on the left of =. Better write
file = [f for f in group if ('cop') not in f][0]
or something similar; this makes it easier to understand.
Don't Use deepcopy Automatically
You use copy.deepcopy. This may be deeper than necessary. You actually only modify the sets, so it is sufficient to change lines like
group.remove(file)
to_delete.extend(group)
and
to_keep.append(group.pop(file))
to_delete.extend(group)
to
g = copy.copy(group)
g.remove(file)
to_delete.extend(g)
and
g = copy.copy(group)
to_keep.append(g.pop(file))
to_delete.extend(g)
or you replace
duplicates = copy.deepcopy(duplicates)
by
duplicates = [set(group) for group in duplicates]
Whether deepcopy actually makes a copy of the contained objects, too, depends on their types. For your algorithm a copy of the objects is not necessary. In the case of strings it seems to depend on the Python implementation whether copies of the strings are made (see this stackoverflow answer). The Python implementation on my notebook does not make copies of the strings; the strings of the deepcopy point to the same objects:
>>> x = 'abc'
>>> y = copy.deepcopy(x)
>>> print(id(x), id(y))
50516992 50516992
But if you use your function for lists of sets of mutable objects, then copies of these objects will be made, and this is not necessary and may slow down a program. But I admit that in your case, where you read a lot of files from disk, this will not have an influence on the overall performance of the program.
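A small demonstration (my addition) that copying only the sets is enough for this algorithm:

groups = [{'a', 'b'}, {'c', 'd'}]
shallow = [set(group) for group in groups]   # new sets, same string objects

shallow[0].remove('a')
print(groups[0])    # {'a', 'b'} -- the original set is untouched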
Don't Relate the Description and Names of a Function and its Variables to Code that isn't Related
Ev. Kounis already showed you some serious problems of the sort_duplicates function.
I will concentrate on non-technical aspects. I don't like the naming and description of sort_duplicates. You are talking about files and duplicates, but this function is not about files and duplicates; it is about a list of sets of strings, and if you didn't have this 'cop' thing it would be about a list of sets of arbitrary objects, not even about strings. You want to choose one representative of each set and return a list of these chosen representatives and a list of the remaining group elements. There is nothing about duplicates. So I would change the naming and description. This makes it easier for a reader to understand the function, and also easier to reuse it in another context or in a library.
edited the post: some notes about the constant 4098 used in the code
– miracle173
Mar 2 at 11:25
edited Mar 2 at 16:24
answered Mar 2 at 9:49
miracle173
1,014514
You said you left out the CLI, therefore you should explain how to call your code. Having a test is great for this, so everything should be fine by now.
– Roland Illig
Jan 16 at 19:18
If the files are very large, you might speed it up by checking the file sizes and the content of the first 100 bytes.
– Maarten Fabré
Mar 2 at 14:23
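A sketch of that suggestion (my addition), phrased as an extra cheap filter that could sit between the size check and the full hash in the multi-pass approach above:

def first_bytes(path, n=100):
    # Files whose first n bytes differ cannot be identical, and reading
    # 100 bytes is much cheaper than hashing a whole large file.
    with open(path, 'rb') as fh:
        return fh.read(n)

# e.g. methods = [lambda p: p.stat().st_size, first_bytes,
#                 functools.partial(calculate_hash, hash=hashlib.md5)]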
Duplicate Files Deleter Program is superb to find and delete duplicate files from your computer. It searches throughout your computer, create a list of duplicate files and present them before you for deciding what you want to do with these files. A great way to free space of your disks. If you will try you will love it.
– Jaydeb Nilson
Apr 14 at 23:01