Select and delete objects from S3 container
GitLab saves files with the pattern 1530410429_2018_07_01_11.0.1_gitlab_backup.tar
(epoch, date, version, fixed string) to an S3 container.
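For reference, splitting such a name on underscores puts the GitLab version at index 4 (using the example filename above):

name = '1530410429_2018_07_01_11.0.1_gitlab_backup.tar'
parts = name.split('_')
# ['1530410429', '2018', '07', '01', '11.0.1', 'gitlab', 'backup.tar']
version = parts[4]  # '11.0.1'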
I wanted to create a Python module to manage those backups, in particular to delete old backups while preserving a specified number of files for each version.
To this end I:
- read S3 bucket contents and populate a list of dictionaries containing file name and an extracted version
- extract a set of versions from the above list
- iterate over each version and create a list of files to delete
- iterate over the above result and delete the files from the bucket
It works, but I feel the data structures I used are not appropriate and not Pythonic.
How could I simplify the data structures and possibly minimize the number of function calls?
import boto3


def get_set_of_versions(table_of_backups):
    set_of_versions = set()
    for backup in table_of_backups:
        set_of_versions.add(backup['gitlab_version'])
    return set_of_versions


def create_table_of_backups(list_of_backup_files):
    table_of_backups = [
        {
            "s3_object_name": filename,
            "gitlab_version": filename.split('_')[4],
        }
        for filename in list_of_backup_files
    ]
    return table_of_backups


def extract_list_of_backups(list_of_objects):
    list_of_backups = [s3_object.key for s3_object in list_of_objects if 'gitlab_backup' in s3_object.key]
    return list_of_backups


class GitlabBackups:
    def __init__(self, bucket_name):
        self.bucket_name = bucket_name
        self.s3 = boto3.resource('s3')
        self.bucket = self.s3.Bucket(bucket_name)
        self.list_of_backup_files = extract_list_of_backups(self.bucket.objects.all())
        self.table_of_backups = create_table_of_backups(self.list_of_backup_files)
        self.set_of_versions = get_set_of_versions(self.table_of_backups)

    def return_candidates_for_deletion_within_version(self, version, generations_to_keep=1):
        if generations_to_keep == 0:
            raise ValueError
        list_of_versioned_backups = [
            backup for backup in self.table_of_backups if backup['gitlab_version'] == version
        ]
        return [backup['s3_object_name'] for backup in list_of_versioned_backups[:-generations_to_keep]]

    def return_candidates_for_deletion(self, generations_to_keep=2):
        resulting_list = list()
        for version in self.set_of_versions:
            resulting_list.extend(self.return_candidates_for_deletion_within_version(version, generations_to_keep))
        return resulting_list

    def delete_old_backups(self, generations_to_keep=2):
        list_to_delete = self.return_candidates_for_deletion(generations_to_keep)
        for key in list_to_delete:
            self.s3.Object(self.bucket_name, key).delete()
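A minimal usage sketch (the bucket name is a placeholder; boto3 credentials are assumed to be configured):

backups = GitlabBackups('gitlab-backups')           # hypothetical bucket name
print(backups.set_of_versions)                       # e.g. {'10.8.4', '11.0.1'}
print(backups.return_candidates_for_deletion(2))     # keys that would be deleted
backups.delete_old_backups(generations_to_keep=2)    # actually delete them from the bucket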
python amazon-web-services
edited Jul 16 at 7:20
asked Jul 15 at 12:46
asylumine
Out of scope, but you should also consider using S3's object lifecycles, which allow you to delete old objects automatically.
– Hubert Grzeskowiak
Jul 15 at 14:36
But I don't want to delete old objects; I want to delete specific objects and preserve older objects. I don't think object lifecycles would allow me to achieve that.
– asylumine
Jul 15 at 17:02
Just in case, what about using the timegaps utility (written in Python) with user-space mounting of the S3 bucket in the filesystem? It may well be that your problem can be solved without any code.
– Roman Susi
Jul 15 at 18:32
Does timegaps account for version numbers stored in file names?
– asylumine
Jul 16 at 7:20
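For reference, the lifecycle suggestion above would look roughly like this with boto3 (a sketch: the bucket name and the 90-day window are placeholders, and as the reply notes it expires objects purely by age rather than keeping a number of generations per version):

import boto3

s3 = boto3.client('s3')
s3.put_bucket_lifecycle_configuration(
    Bucket='gitlab-backups',  # placeholder bucket name
    LifecycleConfiguration={
        'Rules': [{
            'ID': 'expire-old-gitlab-backups',
            'Filter': {'Prefix': ''},    # apply to every object in the bucket
            'Status': 'Enabled',
            'Expiration': {'Days': 90},  # delete anything older than 90 days
        }]
    },
)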
1 Answer
I think your function names could use some work. There is no need to specify that return_candidates_for_deletion actually returns those candidates (it is obvious). It is also a bit superfluous to know that extract_list_of_backups returns a list. It is enough to know that it returns a collection that is iterable (whether that is a tuple, list, set, or even just a generator does not really matter for the rest of the code).
In addition, you can save a lot of code by changing your internal data structure a bit. The main thing your code does is group the backups by the GitLab version and then keep only some of them. So you should store the files in a dictionary with the GitLab version as the key:
import boto3
from collections import defaultdict


class GitlabBackups:
    def __init__(self, bucket_name):
        self.bucket_name = bucket_name
        self.s3 = boto3.resource('s3')
        self.bucket = self.s3.Bucket(bucket_name)

    def backups(self):
        return (obj.key
                for obj in self.bucket.objects.all()
                if 'gitlab_backup' in obj.key)

    def group_by_version(self, files):
        backups = defaultdict(list)
        for file_name in files:
            gitlab_version = file_name.split('_')[4]
            backups[gitlab_version].append(file_name)
        return backups

    def candidates_for_deletion(self, generations_to_keep=2):
        for files in self.group_by_version(self.backups()).values():
            # files.sort()  # is this not needed?
            yield from files[:-generations_to_keep]

    def delete_old_backups(self, generations_to_keep=2):
        for key in self.candidates_for_deletion(generations_to_keep):
            self.s3.Object(self.bucket_name, key).delete()
Here I put all the related standalone functions into the class. It now has one generator that yields all backup files, one method that creates the dictionary with the versions as keys, one generator that yields all files except for the generations to keep for each version, and then the actual deleting method.
Note that with this, as with your code, it is not guaranteed that the latest generation is actually at the end of the list of files. It depends on the order of self.bucket.objects.all(), so on how S3 returns it (which is apparently alphabetical). But you might want to make sure from your side as well and sort each files list in candidates_for_deletion.
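A minimal sketch of that sorting, assuming the leading epoch timestamp is the intended ordering key:

    def candidates_for_deletion(self, generations_to_keep=2):
        for files in self.group_by_version(self.backups()).values():
            # Sort explicitly by the leading epoch timestamp instead of
            # relying on the listing order returned by S3.
            files.sort(key=lambda name: int(name.split('_')[0]))
            yield from files[:-generations_to_keep]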
In order to make this a bit more reusable, you could then pull out a groupby helper that is similar to itertools.groupby, but different because it consumes the whole input and therefore does not require the input to be sorted:
def groupby(objects, key=lambda x: x):
    out = defaultdict(list)
    for obj in objects:
        out[key(obj)].append(obj)
    return out


class GitlabBackups:
    ...

    @staticmethod
    def gitlab_version(file_name):
        return file_name.split('_')[4]

    def candidates_for_deletion(self, generations_to_keep=2):
        for files in groupby(self.backups(), key=self.gitlab_version).values():
            yield from files[:-generations_to_keep]
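A quick illustration of the helper with plain strings (not the S3 keys above):

groups = groupby(['a_1', 'a_2', 'b_1'], key=lambda s: s.split('_')[0])
# groups == defaultdict(list, {'a': ['a_1', 'a_2'], 'b': ['b_1']})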
edited Jul 15 at 16:03
answered Jul 15 at 14:17
Graipher
Thank you. It is guaranteed that the last generation is at the end, because S3 objects are returned in alphabetical order of their names, and in this case they start with the epoch time.
– asylumine
Jul 15 at 14:59
I'd also add a regex to parse the file name. The current filename split is less readable, and the whole script will break if some unfortunate admin adds a test file which does not conform to the file name format.
– Roman Susi
Jul 15 at 18:37
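A sketch of that regex idea (the pattern is an assumption based on the example filename; keys that do not match are skipped instead of breaking the script):

import re

# Matches e.g. 1530410429_2018_07_01_11.0.1_gitlab_backup.tar
BACKUP_RE = re.compile(r'^\d+_\d{4}_\d{2}_\d{2}_(?P<version>[\d.]+)_gitlab_backup\.tar$')

def gitlab_version(file_name):
    match = BACKUP_RE.match(file_name)
    return match.group('version') if match else None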