Append to compressed tar file with performance

The name of the pictureThe name of the pictureThe name of the pictureClash Royale CLAN TAG#URR8PPP





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;







up vote
6
down vote

favorite












As you might know, Python's tarfile does not have the ability to append to files compressed with e.g. gz or bz2. I tried to implement this functionality which works fine but operates slowly. The following function accepts a string or bytes object and appends it as a file to an existing tarfile. I'm not sure if this code has the best performance, and it may have issues. I also tried writing files to memory instead of to a temporary directory, but this didn't impact performance.



import os
import tarfile
import tempfile
import time
from pathlib import Path


def append_tar_file(buffer, file_name, output_path, replace=True):
    """
    Append a buffer as a file to an existing bz2-compressed tar file.

    The archive is extracted to a temporary directory, the buffer is written
    next to the extracted content and everything is compressed again.

    :param buffer: content of the new file; a str is encoded as UTF-8.
    :param file_name: name of the new file; only its base name is used.
    :param output_path: archive to update; nothing happens if it is not an
        existing file.
    :param replace: when False, an existing entry of that name is kept.
    :raises tarfile.TarError, OSError: when the archive cannot be read or
        rewritten; output_path is left unmodified in that case.
    """
    if not os.path.isfile(output_path):
        return
    if isinstance(buffer, str):
        buffer = buffer.encode("utf-8")

    output_dir = os.path.dirname(os.path.abspath(output_path))
    # The new archive is staged next to output_path so that os.replace()
    # stays on one filesystem; both directories are removed on exit.
    with tempfile.TemporaryDirectory() as tempdir, \
            tempfile.TemporaryDirectory(dir=output_dir) as staging_dir:
        with tarfile.open(output_path, "r:bz2") as tar:
            if hasattr(tarfile, "data_filter"):
                # Refuse members that would be written outside tempdir.
                tar.extractall(tempdir, filter="data")
            else:
                tar.extractall(tempdir)

        buffer_path = os.path.join(tempdir, os.path.basename(file_name))
        if replace or not os.path.lexists(buffer_path):
            with open(buffer_path, "wb") as f:
                f.write(buffer)

        # Errors are no longer printed and ignored: continuing after a failed
        # extract or add would silently drop members from the archive.
        staged_path = os.path.join(staging_dir, "staged.tar.bz2")
        with tarfile.open(staged_path, "w:bz2") as tar:
            for entry in sorted(os.listdir(tempdir)):
                tar.add(os.path.join(tempdir, entry), arcname=entry)
        os.replace(staged_path, output_path)

if __name__ == "__main__":
    # Timing driver: prints a timestamp before each of 99 appends.
    # append_tar_file returns immediately when the archive does not exist,
    # so the file has to be created beforehand. It is bz2-compressed
    # ("r:bz2"/"w:bz2"), hence the .tar.bz2 name instead of .tar.gz.
    path = "./test.tar.bz2"
    buffer = "Test String"
    filename = "somefile"
    for i in range(1, 100):
        print(time.time())
        append_tar_file(buffer, filename+str(i), path)






share|improve this question



























    up vote
    6
    down vote

    favorite












    As you might know, Python's tarfile does not have the ability to append to files compressed with e.g. gz or bz2. I tried to implement this functionality which works fine but operates slowly. The following function accepts a string or bytes object and appends it as a file to an existing tarfile. I'm not sure if this code has the best performance, and it may have issues. I also tried writing files to memory instead of to a temporary directory, but this didn't impact performance.



    import os
    import tarfile
    import tempfile
    import time
    from pathlib import Path


    def append_tar_file(buffer, file_name, output_path, replace=True):
        """
        Append a buffer as a file to an existing bz2-compressed tar file.

        The archive is extracted to a temporary directory, the buffer is written
        next to the extracted content and everything is compressed again.

        :param buffer: content of the new file; a str is encoded as UTF-8.
        :param file_name: name of the new file; only its base name is used.
        :param output_path: archive to update; nothing happens if it is not an
            existing file.
        :param replace: when False, an existing entry of that name is kept.
        :raises tarfile.TarError, OSError: when the archive cannot be read or
            rewritten; output_path is left unmodified in that case.
        """
        if not os.path.isfile(output_path):
            return
        if isinstance(buffer, str):
            buffer = buffer.encode("utf-8")

        output_dir = os.path.dirname(os.path.abspath(output_path))
        # The new archive is staged next to output_path so that os.replace()
        # stays on one filesystem; both directories are removed on exit.
        with tempfile.TemporaryDirectory() as tempdir, \
                tempfile.TemporaryDirectory(dir=output_dir) as staging_dir:
            with tarfile.open(output_path, "r:bz2") as tar:
                if hasattr(tarfile, "data_filter"):
                    # Refuse members that would be written outside tempdir.
                    tar.extractall(tempdir, filter="data")
                else:
                    tar.extractall(tempdir)

            buffer_path = os.path.join(tempdir, os.path.basename(file_name))
            if replace or not os.path.lexists(buffer_path):
                with open(buffer_path, "wb") as f:
                    f.write(buffer)

            # Errors are no longer printed and ignored: continuing after a failed
            # extract or add would silently drop members from the archive.
            staged_path = os.path.join(staging_dir, "staged.tar.bz2")
            with tarfile.open(staged_path, "w:bz2") as tar:
                for entry in sorted(os.listdir(tempdir)):
                    tar.add(os.path.join(tempdir, entry), arcname=entry)
            os.replace(staged_path, output_path)

    if __name__ == "__main__":
        # Timing driver: prints a timestamp before each of 99 appends.
        # append_tar_file returns immediately when the archive does not exist,
        # so the file has to be created beforehand. It is bz2-compressed
        # ("r:bz2"/"w:bz2"), hence the .tar.bz2 name instead of .tar.gz.
        path = "./test.tar.bz2"
        buffer = "Test String"
        filename = "somefile"
        for i in range(1, 100):
            print(time.time())
            append_tar_file(buffer, filename+str(i), path)






    share|improve this question























      up vote
      6
      down vote

      favorite









      up vote
      6
      down vote

      favorite











      As you might know, Python's tarfile does not have the ability to append to files compressed with e.g. gz or bz2. I tried to implement this functionality which works fine but operates slowly. The following function accepts a string or bytes object and appends it as a file to an existing tarfile. I'm not sure if this code has the best performance, and it may have issues. I also tried writing files to memory instead of to a temporary directory, but this didn't impact performance.



      import os
      import tarfile
      import tempfile
      import time
      from pathlib import Path


      def append_tar_file(buffer, file_name, output_path, replace=True):
          """
          Append a buffer as a file to an existing bz2-compressed tar file.

          The archive is extracted to a temporary directory, the buffer is written
          next to the extracted content and everything is compressed again.

          :param buffer: content of the new file; a str is encoded as UTF-8.
          :param file_name: name of the new file; only its base name is used.
          :param output_path: archive to update; nothing happens if it is not an
              existing file.
          :param replace: when False, an existing entry of that name is kept.
          :raises tarfile.TarError, OSError: when the archive cannot be read or
              rewritten; output_path is left unmodified in that case.
          """
          if not os.path.isfile(output_path):
              return
          if isinstance(buffer, str):
              buffer = buffer.encode("utf-8")

          output_dir = os.path.dirname(os.path.abspath(output_path))
          # The new archive is staged next to output_path so that os.replace()
          # stays on one filesystem; both directories are removed on exit.
          with tempfile.TemporaryDirectory() as tempdir, \
                  tempfile.TemporaryDirectory(dir=output_dir) as staging_dir:
              with tarfile.open(output_path, "r:bz2") as tar:
                  if hasattr(tarfile, "data_filter"):
                      # Refuse members that would be written outside tempdir.
                      tar.extractall(tempdir, filter="data")
                  else:
                      tar.extractall(tempdir)

              buffer_path = os.path.join(tempdir, os.path.basename(file_name))
              if replace or not os.path.lexists(buffer_path):
                  with open(buffer_path, "wb") as f:
                      f.write(buffer)

              # Errors are no longer printed and ignored: continuing after a failed
              # extract or add would silently drop members from the archive.
              staged_path = os.path.join(staging_dir, "staged.tar.bz2")
              with tarfile.open(staged_path, "w:bz2") as tar:
                  for entry in sorted(os.listdir(tempdir)):
                      tar.add(os.path.join(tempdir, entry), arcname=entry)
              os.replace(staged_path, output_path)

      if __name__ == "__main__":
          # Timing driver: prints a timestamp before each of 99 appends.
          # append_tar_file returns immediately when the archive does not exist,
          # so the file has to be created beforehand. It is bz2-compressed
          # ("r:bz2"/"w:bz2"), hence the .tar.bz2 name instead of .tar.gz.
          path = "./test.tar.bz2"
          buffer = "Test String"
          filename = "somefile"
          for i in range(1, 100):
              print(time.time())
              append_tar_file(buffer, filename+str(i), path)






      share|improve this question













      As you might know, Python's tarfile does not have the ability to append to files compressed with e.g. gz or bz2. I tried to implement this functionality which works fine but operates slowly. The following function accepts a string or bytes object and appends it as a file to an existing tarfile. I'm not sure if this code has the best performance, and it may have issues. I also tried writing files to memory instead of to a temporary directory, but this didn't impact performance.



      import os
      import tarfile
      import tempfile
      import time
      from pathlib import Path


      def append_tar_file(buffer, file_name, output_path, replace=True):
          """
          Append a buffer as a file to an existing bz2-compressed tar file.

          The archive is extracted to a temporary directory, the buffer is written
          next to the extracted content and everything is compressed again.

          :param buffer: content of the new file; a str is encoded as UTF-8.
          :param file_name: name of the new file; only its base name is used.
          :param output_path: archive to update; nothing happens if it is not an
              existing file.
          :param replace: when False, an existing entry of that name is kept.
          :raises tarfile.TarError, OSError: when the archive cannot be read or
              rewritten; output_path is left unmodified in that case.
          """
          if not os.path.isfile(output_path):
              return
          if isinstance(buffer, str):
              buffer = buffer.encode("utf-8")

          output_dir = os.path.dirname(os.path.abspath(output_path))
          # The new archive is staged next to output_path so that os.replace()
          # stays on one filesystem; both directories are removed on exit.
          with tempfile.TemporaryDirectory() as tempdir, \
                  tempfile.TemporaryDirectory(dir=output_dir) as staging_dir:
              with tarfile.open(output_path, "r:bz2") as tar:
                  if hasattr(tarfile, "data_filter"):
                      # Refuse members that would be written outside tempdir.
                      tar.extractall(tempdir, filter="data")
                  else:
                      tar.extractall(tempdir)

              buffer_path = os.path.join(tempdir, os.path.basename(file_name))
              if replace or not os.path.lexists(buffer_path):
                  with open(buffer_path, "wb") as f:
                      f.write(buffer)

              # Errors are no longer printed and ignored: continuing after a failed
              # extract or add would silently drop members from the archive.
              staged_path = os.path.join(staging_dir, "staged.tar.bz2")
              with tarfile.open(staged_path, "w:bz2") as tar:
                  for entry in sorted(os.listdir(tempdir)):
                      tar.add(os.path.join(tempdir, entry), arcname=entry)
              os.replace(staged_path, output_path)

      if __name__ == "__main__":
          # Timing driver: prints a timestamp before each of 99 appends.
          # append_tar_file returns immediately when the archive does not exist,
          # so the file has to be created beforehand. It is bz2-compressed
          # ("r:bz2"/"w:bz2"), hence the .tar.bz2 name instead of .tar.gz.
          path = "./test.tar.bz2"
          buffer = "Test String"
          filename = "somefile"
          for i in range(1, 100):
              print(time.time())
              append_tar_file(buffer, filename+str(i), path)








      share|improve this question












      share|improve this question




      share|improve this question








      edited Jul 29 at 17:44









      Daniel

      4,0632834




      4,0632834









      asked Jul 29 at 7:50









      Masoud R.

      355




      355




















          2 Answers
          2






          active

          oldest

          votes

















          up vote
          5
          down vote



          accepted










          Indeed, the tarfile package doesn't support appending to a compressed tar.
          But I think you can do better than your current attempt.
          Instead of extracting the content to disk, you could keep it in memory, write and append to a new compressed file, and finally rename the compressed file.
          That way you will write a single temporary file,
          instead of many.
          The reduced disk I/O should improve the performance.



          def append_tar_file(buf, file_name, output_path, replace=True):
              """
              Append buf as member file_name to an existing bz2-compressed tar file.

              The existing members are streamed into a new compressed archive,
              followed by the new member; the new archive then replaces the old one.

              :param buf: content of the new member; a str is encoded as UTF-8.
              :param file_name: name of the new member; existing members with that
                  name are not copied to the rewritten archive.
              :param output_path: archive to update; nothing happens if it is not an
                  existing file.
              :param replace: when False, the archive is left untouched if it already
                  contains a member named file_name.
              """
              # Local imports: the snippet does not rely on module-level io/time.
              import io
              import time

              if not os.path.isfile(output_path):
                  return

              if isinstance(buf, str):
                  buf = buf.encode("utf-8")

              tarinfo = tarfile.TarInfo(file_name)
              tarinfo.size = len(buf)
              # TarInfo starts with mtime 0 (1970); date the member like a new file.
              tarinfo.mtime = int(time.time())

              # The temporary archive is created next to output_path: os.replace()
              # then never has to cross a filesystem boundary.
              output_dir = os.path.dirname(os.path.abspath(output_path))
              with tempfile.TemporaryDirectory(dir=output_dir) as tempdir:
                  tmp_path = os.path.join(tempdir, 'tmp.tar.bz2')

                  with tarfile.open(output_path, "r:bz2") as tar:
                      if not replace and file_name in tar.getnames():
                          return

                      with tarfile.open(tmp_path, "w:bz2") as tmp:
                          for member in tar:
                              if member.name == file_name:
                                  continue
                              # Only regular files carry data. The member itself is
                              # passed because a lookup by name returns the last
                              # member of that name.
                              data = tar.extractfile(member) if member.isreg() else None
                              tmp.addfile(member, data)
                          tmp.addfile(tarinfo, io.BytesIO(buf))

                  os.replace(tmp_path, output_path)





          share|improve this answer























          • Can multiprocessing used while working with memory? I mean when adding files back to output tar file use it. When the compressed file getting larger, it would take more time to append.
            – Masoud R.
            Jul 30 at 7:59

















          up vote
          4
          down vote













          Instead of hardcoding the compression scheme (and possibly compressing a .tar.gz file using BZIP2 as you do), you should try to infer that information. Note that tarfile let you open a compressed file without knowing the compression scheme using tarfile.open(filename, 'r:*') but there is no equivalent for writing the archive.



          Since the compression scheme understood by the tarfile module are usual extensions of the files, inspecting the suffixes of output_path should be enough:



          def get_compression(filename):
              """
              Infer the tarfile compression scheme from the suffixes of filename.

              :param filename: name or path of the archive, e.g. 'data.tar.gz'.
              :return: the compression part of a tarfile mode: the suffix that
                  follows '.tar' ('gz', 'bz2', ...), '' for a plain '.tar', or the
                  scheme behind a short suffix such as '.tgz'.
              :raises RuntimeError: if the name does not look like a tar archive or
                  carries more than one suffix after the tar suffix.
              """
              short_suffixes = {'tgz': 'gz', 'tbz': 'bz2', 'tbz2': 'bz2', 'txz': 'xz'}
              suffixes = Path(filename).suffixes
              names = [s.lstrip('.') for s in suffixes]

              # Search from the right so that dots in the stem ('backup.2018.tar.gz')
              # are not mistaken for the archive suffix.
              for index in range(len(names) - 1, -1, -1):
                  if names[index] == 'tar' or names[index] in short_suffixes:
                      break
              else:
                  raise RuntimeError('Not a tar archive')

              marker = names[index]
              compression = names[index + 1:]

              if marker != 'tar':
                  if compression:
                      raise RuntimeError('Too much suffixes, cannot infer compression scheme from {}'.format(''.join(suffixes)))
                  return short_suffixes[marker]

              if not compression:
                  return ''

              if len(compression) > 1:
                  raise RuntimeError('Too much compression scheme: {}'.format(', '.join(compression)))
              return compression[0]


          Now you can use compression = get_compression(output_path) and then open the tar file using tarfile.open(<name>, 'r:{}'.format(compression)) and open the file for writing using tarfile.open(<name>, 'w:{}'.format(compression)).



          Note that I used RuntimeError here but you should probably come up with something less generic.






          share|improve this answer























          • Thanks. The function return gz for the files that created with bz2 in tarfile. Thus it should return bz2 instead of gz.
            – Masoud R.
            Jul 30 at 6:41










          • If your file is named something.tar.gz, it is not coherent to use anything else than GZIP compression to create the file. This is the assumption this function is doing, hence the choice to only check the filename and not trying to do anything fancy with the file content. If you store BZIP2 compressed data in a .tar.gz this is your mistake to start with.
            – Mathias Ettinger
            Jul 30 at 7:15










          • So you mean I should use tar.bz2 right?
            – Masoud R.
            Jul 30 at 7:17






          • 2




            If you want to store BZIP2 compressed data, definitely.
            – Mathias Ettinger
            Jul 30 at 7:23






          • 1




            On the other hand, if you want something fancier based on the file content rather than its name, you can check how tarfile handle the r:* mode. Basically, it tries to open the file using, in turn gzip.GzipFile, bz2.BZ2File, lzma.LZMAFile and if all failed fallback to trying without compression.
            – Mathias Ettinger
            Jul 30 at 7:33










          Your Answer




          StackExchange.ifUsing("editor", function ()
          return StackExchange.using("mathjaxEditing", function ()
          StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
          StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
          );
          );
          , "mathjax-editing");

          StackExchange.ifUsing("editor", function ()
          StackExchange.using("externalEditor", function ()
          StackExchange.using("snippets", function ()
          StackExchange.snippets.init();
          );
          );
          , "code-snippets");

          StackExchange.ready(function()
          var channelOptions =
          tags: "".split(" "),
          id: "196"
          ;
          initTagRenderer("".split(" "), "".split(" "), channelOptions);

          StackExchange.using("externalEditor", function()
          // Have to fire editor after snippets, if snippets enabled
          if (StackExchange.settings.snippets.snippetsEnabled)
          StackExchange.using("snippets", function()
          createEditor();
          );

          else
          createEditor();

          );

          function createEditor()
          StackExchange.prepareEditor(
          heartbeatType: 'answer',
          convertImagesToLinks: false,
          noModals: false,
          showLowRepImageUploadWarning: true,
          reputationToPostImages: null,
          bindNavPrevention: true,
          postfix: "",
          onDemand: true,
          discardSelector: ".discard-answer"
          ,immediatelyShowMarkdownHelp:true
          );



          );








           

          draft saved


          draft discarded


















          StackExchange.ready(
          function ()
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f200514%2fappend-to-compressed-tar-file-with-performance%23new-answer', 'question_page');

          );

          Post as a guest






























          2 Answers
          2






          active

          oldest

          votes








          2 Answers
          2






          active

          oldest

          votes









          active

          oldest

          votes






          active

          oldest

          votes








          up vote
          5
          down vote



          accepted










          Indeed, the tarfile package doesn't support appending to a compressed tar.
          But I think you can do better than your current attempt.
          Instead of extracting the content to disk, you could keep it in memory, write and append to a new compressed file, and finally rename the compressed file.
          That way you will write a single temporary file,
          instead of many.
          The reduced disk I/O should improve the performance.



          def append_tar_file(buf, file_name, output_path, replace=True):
              """
              Append buf as member file_name to an existing bz2-compressed tar file.

              The existing members are streamed into a new compressed archive,
              followed by the new member; the new archive then replaces the old one.

              :param buf: content of the new member; a str is encoded as UTF-8.
              :param file_name: name of the new member; existing members with that
                  name are not copied to the rewritten archive.
              :param output_path: archive to update; nothing happens if it is not an
                  existing file.
              :param replace: when False, the archive is left untouched if it already
                  contains a member named file_name.
              """
              # Local imports: the snippet does not rely on module-level io/time.
              import io
              import time

              if not os.path.isfile(output_path):
                  return

              if isinstance(buf, str):
                  buf = buf.encode("utf-8")

              tarinfo = tarfile.TarInfo(file_name)
              tarinfo.size = len(buf)
              # TarInfo starts with mtime 0 (1970); date the member like a new file.
              tarinfo.mtime = int(time.time())

              # The temporary archive is created next to output_path: os.replace()
              # then never has to cross a filesystem boundary.
              output_dir = os.path.dirname(os.path.abspath(output_path))
              with tempfile.TemporaryDirectory(dir=output_dir) as tempdir:
                  tmp_path = os.path.join(tempdir, 'tmp.tar.bz2')

                  with tarfile.open(output_path, "r:bz2") as tar:
                      if not replace and file_name in tar.getnames():
                          return

                      with tarfile.open(tmp_path, "w:bz2") as tmp:
                          for member in tar:
                              if member.name == file_name:
                                  continue
                              # Only regular files carry data. The member itself is
                              # passed because a lookup by name returns the last
                              # member of that name.
                              data = tar.extractfile(member) if member.isreg() else None
                              tmp.addfile(member, data)
                          tmp.addfile(tarinfo, io.BytesIO(buf))

                  os.replace(tmp_path, output_path)





          share|improve this answer























          • Can multiprocessing used while working with memory? I mean when adding files back to output tar file use it. When the compressed file getting larger, it would take more time to append.
            – Masoud R.
            Jul 30 at 7:59














          up vote
          5
          down vote



          accepted










          Indeed, the tarfile package doesn't support appending to a compressed tar.
          But I think you can do better than your current attempt.
          Instead of extracting the content to disk, you could keep it in memory, write and append to a new compressed file, and finally rename the compressed file.
          That way you will write a single temporary file,
          instead of many.
          The reduced disk I/O should improve the performance.



          def append_tar_file(buf, file_name, output_path, replace=True):
              """
              Append buf as member file_name to an existing bz2-compressed tar file.

              The existing members are streamed into a new compressed archive,
              followed by the new member; the new archive then replaces the old one.

              :param buf: content of the new member; a str is encoded as UTF-8.
              :param file_name: name of the new member; existing members with that
                  name are not copied to the rewritten archive.
              :param output_path: archive to update; nothing happens if it is not an
                  existing file.
              :param replace: when False, the archive is left untouched if it already
                  contains a member named file_name.
              """
              # Local imports: the snippet does not rely on module-level io/time.
              import io
              import time

              if not os.path.isfile(output_path):
                  return

              if isinstance(buf, str):
                  buf = buf.encode("utf-8")

              tarinfo = tarfile.TarInfo(file_name)
              tarinfo.size = len(buf)
              # TarInfo starts with mtime 0 (1970); date the member like a new file.
              tarinfo.mtime = int(time.time())

              # The temporary archive is created next to output_path: os.replace()
              # then never has to cross a filesystem boundary.
              output_dir = os.path.dirname(os.path.abspath(output_path))
              with tempfile.TemporaryDirectory(dir=output_dir) as tempdir:
                  tmp_path = os.path.join(tempdir, 'tmp.tar.bz2')

                  with tarfile.open(output_path, "r:bz2") as tar:
                      if not replace and file_name in tar.getnames():
                          return

                      with tarfile.open(tmp_path, "w:bz2") as tmp:
                          for member in tar:
                              if member.name == file_name:
                                  continue
                              # Only regular files carry data. The member itself is
                              # passed because a lookup by name returns the last
                              # member of that name.
                              data = tar.extractfile(member) if member.isreg() else None
                              tmp.addfile(member, data)
                          tmp.addfile(tarinfo, io.BytesIO(buf))

                  os.replace(tmp_path, output_path)





          share|improve this answer























          • Can multiprocessing used while working with memory? I mean when adding files back to output tar file use it. When the compressed file getting larger, it would take more time to append.
            – Masoud R.
            Jul 30 at 7:59












          up vote
          5
          down vote



          accepted







          up vote
          5
          down vote



          accepted






          Indeed, the tarfile package doesn't support appending to a compressed tar.
          But I think you can do better than your current attempt.
          Instead of extracting the content to disk, you could keep it in memory, write and append to a new compressed file, and finally rename the compressed file.
          That way you will write a single temporary file,
          instead of many.
          The reduced disk I/O should improve the performance.



          def append_tar_file(buf, file_name, output_path, replace=True):
              """
              Append buf as member file_name to an existing bz2-compressed tar file.

              The existing members are streamed into a new compressed archive,
              followed by the new member; the new archive then replaces the old one.

              :param buf: content of the new member; a str is encoded as UTF-8.
              :param file_name: name of the new member; existing members with that
                  name are not copied to the rewritten archive.
              :param output_path: archive to update; nothing happens if it is not an
                  existing file.
              :param replace: when False, the archive is left untouched if it already
                  contains a member named file_name.
              """
              # Local imports: the snippet does not rely on module-level io/time.
              import io
              import time

              if not os.path.isfile(output_path):
                  return

              if isinstance(buf, str):
                  buf = buf.encode("utf-8")

              tarinfo = tarfile.TarInfo(file_name)
              tarinfo.size = len(buf)
              # TarInfo starts with mtime 0 (1970); date the member like a new file.
              tarinfo.mtime = int(time.time())

              # The temporary archive is created next to output_path: os.replace()
              # then never has to cross a filesystem boundary.
              output_dir = os.path.dirname(os.path.abspath(output_path))
              with tempfile.TemporaryDirectory(dir=output_dir) as tempdir:
                  tmp_path = os.path.join(tempdir, 'tmp.tar.bz2')

                  with tarfile.open(output_path, "r:bz2") as tar:
                      if not replace and file_name in tar.getnames():
                          return

                      with tarfile.open(tmp_path, "w:bz2") as tmp:
                          for member in tar:
                              if member.name == file_name:
                                  continue
                              # Only regular files carry data. The member itself is
                              # passed because a lookup by name returns the last
                              # member of that name.
                              data = tar.extractfile(member) if member.isreg() else None
                              tmp.addfile(member, data)
                          tmp.addfile(tarinfo, io.BytesIO(buf))

                  os.replace(tmp_path, output_path)





          share|improve this answer















          Indeed, the tarfile package doesn't support appending to a compressed tar.
          But I think you can do better than your current attempt.
          Instead of extracting the content to disk, you could keep it in memory, write and append to a new compressed file, and finally rename the compressed file.
          That way you will write a single temporary file,
          instead of many.
          The reduced disk I/O should improve the performance.



          def append_tar_file(buf, file_name, output_path, replace=True):
              """
              Append buf as member file_name to an existing bz2-compressed tar file.

              The existing members are streamed into a new compressed archive,
              followed by the new member; the new archive then replaces the old one.

              :param buf: content of the new member; a str is encoded as UTF-8.
              :param file_name: name of the new member; existing members with that
                  name are not copied to the rewritten archive.
              :param output_path: archive to update; nothing happens if it is not an
                  existing file.
              :param replace: when False, the archive is left untouched if it already
                  contains a member named file_name.
              """
              # Local imports: the snippet does not rely on module-level io/time.
              import io
              import time

              if not os.path.isfile(output_path):
                  return

              if isinstance(buf, str):
                  buf = buf.encode("utf-8")

              tarinfo = tarfile.TarInfo(file_name)
              tarinfo.size = len(buf)
              # TarInfo starts with mtime 0 (1970); date the member like a new file.
              tarinfo.mtime = int(time.time())

              # The temporary archive is created next to output_path: os.replace()
              # then never has to cross a filesystem boundary.
              output_dir = os.path.dirname(os.path.abspath(output_path))
              with tempfile.TemporaryDirectory(dir=output_dir) as tempdir:
                  tmp_path = os.path.join(tempdir, 'tmp.tar.bz2')

                  with tarfile.open(output_path, "r:bz2") as tar:
                      if not replace and file_name in tar.getnames():
                          return

                      with tarfile.open(tmp_path, "w:bz2") as tmp:
                          for member in tar:
                              if member.name == file_name:
                                  continue
                              # Only regular files carry data. The member itself is
                              # passed because a lookup by name returns the last
                              # member of that name.
                              data = tar.extractfile(member) if member.isreg() else None
                              tmp.addfile(member, data)
                          tmp.addfile(tarinfo, io.BytesIO(buf))

                  os.replace(tmp_path, output_path)






          share|improve this answer















          share|improve this answer



          share|improve this answer








          edited Jul 30 at 9:06









          Daniel

          4,0632834




          4,0632834











          answered Jul 29 at 10:18









          janos

          95.2k12119342




          95.2k12119342











          • Can multiprocessing used while working with memory? I mean when adding files back to output tar file use it. When the compressed file getting larger, it would take more time to append.
            – Masoud R.
            Jul 30 at 7:59
















          • Can multiprocessing used while working with memory? I mean when adding files back to output tar file use it. When the compressed file getting larger, it would take more time to append.
            – Masoud R.
            Jul 30 at 7:59















          Can multiprocessing used while working with memory? I mean when adding files back to output tar file use it. When the compressed file getting larger, it would take more time to append.
          – Masoud R.
          Jul 30 at 7:59




          Can multiprocessing used while working with memory? I mean when adding files back to output tar file use it. When the compressed file getting larger, it would take more time to append.
          – Masoud R.
          Jul 30 at 7:59












          up vote
          4
          down vote













          Instead of hardcoding the compression scheme (and possibly compressing a .tar.gz file using BZIP2 as you do), you should try to infer that information. Note that tarfile let you open a compressed file without knowing the compression scheme using tarfile.open(filename, 'r:*') but there is no equivalent for writing the archive.



          Since the compression scheme understood by the tarfile module are usual extensions of the files, inspecting the suffixes of output_path should be enough:



          def get_compression(filename):
              """
              Infer the tarfile compression scheme from the suffixes of filename.

              :param filename: name or path of the archive, e.g. 'data.tar.gz'.
              :return: the compression part of a tarfile mode: the suffix that
                  follows '.tar' ('gz', 'bz2', ...), '' for a plain '.tar', or the
                  scheme behind a short suffix such as '.tgz'.
              :raises RuntimeError: if the name does not look like a tar archive or
                  carries more than one suffix after the tar suffix.
              """
              short_suffixes = {'tgz': 'gz', 'tbz': 'bz2', 'tbz2': 'bz2', 'txz': 'xz'}
              suffixes = Path(filename).suffixes
              names = [s.lstrip('.') for s in suffixes]

              # Search from the right so that dots in the stem ('backup.2018.tar.gz')
              # are not mistaken for the archive suffix.
              for index in range(len(names) - 1, -1, -1):
                  if names[index] == 'tar' or names[index] in short_suffixes:
                      break
              else:
                  raise RuntimeError('Not a tar archive')

              marker = names[index]
              compression = names[index + 1:]

              if marker != 'tar':
                  if compression:
                      raise RuntimeError('Too much suffixes, cannot infer compression scheme from {}'.format(''.join(suffixes)))
                  return short_suffixes[marker]

              if not compression:
                  return ''

              if len(compression) > 1:
                  raise RuntimeError('Too much compression scheme: {}'.format(', '.join(compression)))
              return compression[0]


          Now you can use compression = get_compression(output_path) and then open the tar file using tarfile.open(<name>, 'r:{}'.format(compression)) and open the file for writing using tarfile.open(<name>, 'w:{}'.format(compression)).



          Note that I used RuntimeError here but you should probably come up with something less generic.






          share|improve this answer























          • Thanks. The function return gz for the files that created with bz2 in tarfile. Thus it should return bz2 instead of gz.
            – Masoud R.
            Jul 30 at 6:41










          • If your file is named something.tar.gz, it is not coherent to use anything else than GZIP compression to create the file. This is the assumption this function is doing, hence the choice to only check the filename and not trying to do anything fancy with the file content. If you store BZIP2 compressed data in a .tar.gz this is your mistake to start with.
            – Mathias Ettinger
            Jul 30 at 7:15










          • So you mean I should use tar.bz2 right?
            – Masoud R.
            Jul 30 at 7:17






          • 2




            If you want to store BZIP2 compressed data, definitely.
            – Mathias Ettinger
            Jul 30 at 7:23






          • 1




            On the other hand, if you want something fancier based on the file content rather than its name, you can check how tarfile handle the r:* mode. Basically, it tries to open the file using, in turn gzip.GzipFile, bz2.BZ2File, lzma.LZMAFile and if all failed fallback to trying without compression.
            – Mathias Ettinger
            Jul 30 at 7:33














          up vote
          4
          down vote













          Instead of hardcoding the compression scheme (and possibly compressing a .tar.gz file using BZIP2 as you do), you should try to infer that information. Note that tarfile let you open a compressed file without knowing the compression scheme using tarfile.open(filename, 'r:*') but there is no equivalent for writing the archive.



          Since the compression scheme understood by the tarfile module are usual extensions of the files, inspecting the suffixes of output_path should be enough:



          def get_compression(filename):
              """
              Infer the tarfile compression scheme from the suffixes of filename.

              :param filename: name or path of the archive, e.g. 'data.tar.gz'.
              :return: the compression part of a tarfile mode: the suffix that
                  follows '.tar' ('gz', 'bz2', ...), '' for a plain '.tar', or the
                  scheme behind a short suffix such as '.tgz'.
              :raises RuntimeError: if the name does not look like a tar archive or
                  carries more than one suffix after the tar suffix.
              """
              short_suffixes = {'tgz': 'gz', 'tbz': 'bz2', 'tbz2': 'bz2', 'txz': 'xz'}
              suffixes = Path(filename).suffixes
              names = [s.lstrip('.') for s in suffixes]

              # Search from the right so that dots in the stem ('backup.2018.tar.gz')
              # are not mistaken for the archive suffix.
              for index in range(len(names) - 1, -1, -1):
                  if names[index] == 'tar' or names[index] in short_suffixes:
                      break
              else:
                  raise RuntimeError('Not a tar archive')

              marker = names[index]
              compression = names[index + 1:]

              if marker != 'tar':
                  if compression:
                      raise RuntimeError('Too much suffixes, cannot infer compression scheme from {}'.format(''.join(suffixes)))
                  return short_suffixes[marker]

              if not compression:
                  return ''

              if len(compression) > 1:
                  raise RuntimeError('Too much compression scheme: {}'.format(', '.join(compression)))
              return compression[0]


          Now you can use compression = get_compression(output_path) and then open the tar file using tarfile.open(<name>, 'r:{}'.format(compression)) and open the file for writing using tarfile.open(<name>, 'w:{}'.format(compression)).



          Note that I used RuntimeError here but you should probably come up with something less generic.






          share|improve this answer























          • Thanks. The function return gz for the files that created with bz2 in tarfile. Thus it should return bz2 instead of gz.
            – Masoud R.
            Jul 30 at 6:41










          • If your file is named something.tar.gz, it is not coherent to use anything else than GZIP compression to create the file. This is the assumption this function is doing, hence the choice to only check the filename and not trying to do anything fancy with the file content. If you store BZIP2 compressed data in a .tar.gz this is your mistake to start with.
            – Mathias Ettinger
            Jul 30 at 7:15










          • So you mean I should use tar.bz2 right?
            – Masoud R.
            Jul 30 at 7:17






          • 2




            If you want to store BZIP2 compressed data, definitely.
            – Mathias Ettinger
            Jul 30 at 7:23






          • 1




            On the other hand, if you want something fancier based on the file content rather than its name, you can check how tarfile handle the r:* mode. Basically, it tries to open the file using, in turn gzip.GzipFile, bz2.BZ2File, lzma.LZMAFile and if all failed fallback to trying without compression.
            – Mathias Ettinger
            Jul 30 at 7:33












          up vote
          4
          down vote










          up vote
          4
          down vote









          Instead of hardcoding the compression scheme (and possibly compressing a .tar.gz file using BZIP2 as you do), you should try to infer that information. Note that tarfile let you open a compressed file without knowing the compression scheme using tarfile.open(filename, 'r:*') but there is no equivalent for writing the archive.



          Since the compression scheme understood by the tarfile module are usual extensions of the files, inspecting the suffixes of output_path should be enough:



          def get_compression(filename):
              """
              Infer the tarfile compression scheme from the suffixes of filename.

              :param filename: name or path of the archive, e.g. 'data.tar.gz'.
              :return: the compression part of a tarfile mode: the suffix that
                  follows '.tar' ('gz', 'bz2', ...), '' for a plain '.tar', or the
                  scheme behind a short suffix such as '.tgz'.
              :raises RuntimeError: if the name does not look like a tar archive or
                  carries more than one suffix after the tar suffix.
              """
              short_suffixes = {'tgz': 'gz', 'tbz': 'bz2', 'tbz2': 'bz2', 'txz': 'xz'}
              suffixes = Path(filename).suffixes
              names = [s.lstrip('.') for s in suffixes]

              # Search from the right so that dots in the stem ('backup.2018.tar.gz')
              # are not mistaken for the archive suffix.
              for index in range(len(names) - 1, -1, -1):
                  if names[index] == 'tar' or names[index] in short_suffixes:
                      break
              else:
                  raise RuntimeError('Not a tar archive')

              marker = names[index]
              compression = names[index + 1:]

              if marker != 'tar':
                  if compression:
                      raise RuntimeError('Too much suffixes, cannot infer compression scheme from {}'.format(''.join(suffixes)))
                  return short_suffixes[marker]

              if not compression:
                  return ''

              if len(compression) > 1:
                  raise RuntimeError('Too much compression scheme: {}'.format(', '.join(compression)))
              return compression[0]


          Now you can use compression = get_compression(output_path) and then open the tar file using tarfile.open(<name>, 'r:{}'.format(compression)) and open the file for writing using tarfile.open(<name>, 'w:{}'.format(compression)).



          Note that I used RuntimeError here but you should probably come up with something less generic.






          share|improve this answer















          Instead of hardcoding the compression scheme (and possibly compressing a .tar.gz file using BZIP2 as you do), you should try to infer that information. Note that tarfile let you open a compressed file without knowing the compression scheme using tarfile.open(filename, 'r:*') but there is no equivalent for writing the archive.



          Since the compression scheme understood by the tarfile module are usual extensions of the files, inspecting the suffixes of output_path should be enough:



          def get_compression(filename):
              """Infer the tarfile compression suffix from a file name.

              Only the name is inspected; the file content is never read.
              The first suffix must be ``tar`` or ``tgz``, so base names that
              contain extra dots (e.g. ``backup.v1.tar.gz``) are rejected.

              :param filename: path (``str`` or path-like) of the archive.
              :return: ``''`` for a plain ``.tar`` file, ``'gz'`` for ``.tgz``,
                  otherwise the single suffix following ``.tar`` (for example
                  ``'gz'`` or ``'bz2'``), ready to be appended to a ``'r:'``
                  or ``'w:'`` tarfile mode.
              :raises RuntimeError: if the name has no suffix, does not start
                  with a tar suffix, or carries more than one compression suffix.
              """
              suffixes = Path(filename).suffixes
              if not suffixes:
                  # Without this guard the star-unpacking below fails with an
                  # unrelated ValueError for names that have no extension.
                  raise RuntimeError('Not a tar archive')

              tar, *compression = (s.lstrip('.') for s in suffixes)

              if tar == 'tgz':
                  # '.tgz' already implies gzip; anything after it is ambiguous.
                  if compression:
                      raise RuntimeError('Too much suffixes, cannot infer compression scheme from {}'.format(''.join(suffixes)))
                  return 'gz'

              if tar != 'tar':
                  raise RuntimeError('Not a tar archive')

              if not compression:
                  return ''

              try:
                  # Single-element unpacking: fails unless exactly one suffix is left.
                  compression, = compression
              except ValueError:
                  # The failed unpacking left ``compression`` bound to the list.
                  raise RuntimeError('Too much compression scheme: {}'.format(', '.join(compression))) from None
              else:
                  return compression


          Now you can use compression = get_compression(output_path), then open the tar file for reading using tarfile.open(<name>, 'r:{}'.format(compression)) and for writing using tarfile.open(<name>, 'w:{}'.format(compression)).



          Note that I used RuntimeError here but you should probably come up with something less generic.







          share|improve this answer















          share|improve this answer



          share|improve this answer








          edited Jul 29 at 11:57


























          answered Jul 29 at 11:51









          Mathias Ettinger

          21.7k32875




          21.7k32875











          • Thanks. The function returns gz for files that were created with bz2 in tarfile. It should return bz2 instead of gz.
            – Masoud R.
            Jul 30 at 6:41










          • If your file is named something.tar.gz, it is not coherent to use anything else than GZIP compression to create the file. This is the assumption this function is doing, hence the choice to only check the filename and not trying to do anything fancy with the file content. If you store BZIP2 compressed data in a .tar.gz this is your mistake to start with.
            – Mathias Ettinger
            Jul 30 at 7:15










          • So you mean I should use tar.bz2 right?
            – Masoud R.
            Jul 30 at 7:17






          • 2




            If you want to store BZIP2 compressed data, definitely.
            – Mathias Ettinger
            Jul 30 at 7:23






          • 1




            On the other hand, if you want something fancier based on the file content rather than its name, you can check how tarfile handles the r:* mode. Basically, it tries to open the file using, in turn, gzip.GzipFile, bz2.BZ2File and lzma.LZMAFile, and if all of them fail, it falls back to trying without compression.
            – Mathias Ettinger
            Jul 30 at 7:33
















          • Thanks. The function returns gz for files that were created with bz2 in tarfile. It should return bz2 instead of gz.
            – Masoud R.
            Jul 30 at 6:41










          • If your file is named something.tar.gz, it is not coherent to use anything else than GZIP compression to create the file. This is the assumption this function is doing, hence the choice to only check the filename and not trying to do anything fancy with the file content. If you store BZIP2 compressed data in a .tar.gz this is your mistake to start with.
            – Mathias Ettinger
            Jul 30 at 7:15










          • So you mean I should use tar.bz2 right?
            – Masoud R.
            Jul 30 at 7:17






          • 2




            If you want to store BZIP2 compressed data, definitely.
            – Mathias Ettinger
            Jul 30 at 7:23






          • 1




            On the other hand, if you want something fancier based on the file content rather than its name, you can check how tarfile handles the r:* mode. Basically, it tries to open the file using, in turn, gzip.GzipFile, bz2.BZ2File and lzma.LZMAFile, and if all of them fail, it falls back to trying without compression.
            – Mathias Ettinger
            Jul 30 at 7:33















          Thanks. The function returns gz for files that were created with bz2 in tarfile. It should return bz2 instead of gz.
          – Masoud R.
          Jul 30 at 6:41




          Thanks. The function returns gz for files that were created with bz2 in tarfile. It should return bz2 instead of gz.
          – Masoud R.
          Jul 30 at 6:41












          If your file is named something.tar.gz, it is not coherent to use anything else than GZIP compression to create the file. This is the assumption this function is doing, hence the choice to only check the filename and not trying to do anything fancy with the file content. If you store BZIP2 compressed data in a .tar.gz this is your mistake to start with.
          – Mathias Ettinger
          Jul 30 at 7:15




          If your file is named something.tar.gz, it is not coherent to use anything else than GZIP compression to create the file. This is the assumption this function is doing, hence the choice to only check the filename and not trying to do anything fancy with the file content. If you store BZIP2 compressed data in a .tar.gz this is your mistake to start with.
          – Mathias Ettinger
          Jul 30 at 7:15












          So you mean I should use tar.bz2 right?
          – Masoud R.
          Jul 30 at 7:17




          So you mean I should use tar.bz2 right?
          – Masoud R.
          Jul 30 at 7:17




          2




          2




          If you want to store BZIP2 compressed data, definitely.
          – Mathias Ettinger
          Jul 30 at 7:23




          If you want to store BZIP2 compressed data, definitely.
          – Mathias Ettinger
          Jul 30 at 7:23




          1




          1




          On the other hand, if you want something fancier based on the file content rather than its name, you can check how tarfile handles the r:* mode. Basically, it tries to open the file using, in turn, gzip.GzipFile, bz2.BZ2File and lzma.LZMAFile, and if all of them fail, it falls back to trying without compression.
          – Mathias Ettinger
          Jul 30 at 7:33




          On the other hand, if you want something fancier based on the file content rather than its name, you can check how tarfile handles the r:* mode. Basically, it tries to open the file using, in turn, gzip.GzipFile, bz2.BZ2File and lzma.LZMAFile, and if all of them fail, it falls back to trying without compression.
          – Mathias Ettinger
          Jul 30 at 7:33












           

          draft saved


          draft discarded


























           


          draft saved


          draft discarded














          // Once the page is ready, wire up the post-login flow for the
          // ".new-post-login" element, passing the URL-encoded return address
          // of this question's new-answer anchor.
          StackExchange.ready(
          function () {
              StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f200514%2fappend-to-compressed-tar-file-with-performance%23new-answer', 'question_page');
          }
          );

          Post as a guest













































































          Popular posts from this blog

          Greedy Best First Search implementation in Rust

          Function to Return a JSON Like Objects Using VBA Collections and Arrays

          C++11 CLH Lock Implementation