Formatting Excel data into smaller pivot tables
The code below (Distribution.py) reads the results I get from a simulation I am running and formats them into smaller pivot tables. The data comes out as Excel files with several worksheets. With smaller data files it is fast, but it is quite slow processing larger ones. Any hints on how I can improve its speed?
The sample data below is from Distribution1, a worksheet in the Excel file sample.xlsx.
Distribution.py
import pandas as pd
from pandas import ExcelWriter


def distribution_data_processor(luti_data_file, sheetname):
    """A function that analyses LUTI model trip distribution results and returns
    pivot tables of given scenarios or variable combinations."""
    # Define variables
    temp = []
    list_dfs = []
    final_dfs_list = []
    # Read the Excel input file and create a map of all worksheets in pandas
    df_map = pd.read_excel(luti_data_file, sheetname=None)
    # Make a pandas data frame of a given worksheet
    d = df_map[sheetname]
    # Delete the Variable column since its values are redundant
    del d['Variable']
    # Get the unique values of each column in the data frame
    num_pur = d['Purpose'].unique()
    num_time = d['Time Period'].unique()
    num_mode = d['Mode'].unique()
    # Create sub pivot tables from the data
    for time in num_time:
        try:
            tp = d[d['Time Period'] == time]
            for pur in num_pur:
                pivoted = tp.loc[tp['Purpose'] == pur].pivot_table(
                    index=['Zone (origin)', 'Time Period', 'Purpose', 'Mode'],
                    columns=['Zone (destination)'],
                    values=['1995-Jan-01 00:00:00', '2000-Jan-01 00:00:00',
                            '2005-Jan-01 00:00:00']).fillna(0.0)
                list_dfs.append(pivoted)
        except IndexError:
            pass
    # Analyse further the tables with two values in the Mode column
    for df in list_dfs:
        mask = df.index.get_level_values(3) == 'Bus'
        df1 = df[mask]
        temp.append(df1)
        df2 = df[~mask]
        temp.append(df2)
    # Eliminate redundant or empty pivots
    final_dfs_list = [i for idx, i in enumerate(temp) if i.index.values.any()]
    return final_dfs_list


def save_xls(list_dfs, xls_path):
    """A function to write the results of the distribution
    processor function above to file."""
    writer = ExcelWriter(xls_path)
    for n, df in enumerate(list_dfs):
        df.to_excel(writer, 'sheet%s' % n)
    writer.save()


if __name__ == "__main__":
    #distribution_data_processor('sample.xlsx', 'Distribution1')
    save_xls(distribution_data_processor('sample.xlsx', 'Distribution1'), 'result.xlsx')
Distribution1
Formatted for better readability:
| Variable | Time Period | Purpose | Mode | Zone (origin) | Zone (destination) | 1995-Jan-01 00:00:00 | 2000-Jan-01 00:00:00 | 2005-Jan-01 00:00:00 |
|----------|-------------|---------|------|---------------|--------------------|----------------------|----------------------|----------------------|
| Trips | Rest_of_day | Home_Others_Car | Bus | Zonnebloem | Heathfield | 0.001 | 3.19544E-07 | 0.004420692 |
| Trips | Rest_of_day | Home_Others_Car | Bus | Zonnebloem | Heideveld | 0.001 | 1.49769E-10 | 1.88921E-06 |
| Trips | Rest_of_day | Home_Others_Car | Bus | Zonnebloem | Helderberg Rural | 0.001 | 3.072E-08 | 0.00012523 |
| Trips | Rest_of_day | Home_Others_Car | Bus | Zonnebloem | Hout Bay | 0.001 | 4.36081E-07 | 0.010432741 |
| Trips | Rest_of_day | Home_Others_Car | Bus | Zonnebloem | Joostenberg Vlakte | 0.001 | 2.81437E-08 | 0.00014551 |
| Trips | Rest_of_day | Home_Others_Car | Bus | Zonnebloem | Kenilworth | 0.001 | 8.54678E-06 | 0.082402039 |
| Trips | Rest_of_day | Home_Others_Minibus | Car+Minibus | Airport Industria | Dunoon | 0.001 | 3.9958E-07 | 3.80314E-07 |
| Trips | Rest_of_day | Home_Others_Minibus | Car+Minibus | Airport Industria | Durbanville | 0.001 | 1.43952E-05 | 1.98133E-05 |
| Trips | Rest_of_day | Home_Others_Minibus | Car+Minibus | Airport Industria | Edgemead | 0.001 | 5.70312E-07 | 7.6349E-07 |
| Trips | Rest_of_day | Home_Others_Minibus | Car+Minibus | Airport Industria | Eersterivier | 0.014476378 | 1.53594E-06 | 1.58987E-06 |
| Trips | Rest_of_day | Home_Others_Minibus | Car+Minibus | Airport Industria | Elsies River | 0.052003373 | 5.33659E-06 | 3.71889E-06 |
| Trips | Rest_of_day | Home_Others_Minibus | Car+Minibus | Airport Industria | Epping Industria | 0.090892934 | 9.43124E-11 | 6.70574E-11 |
python performance excel pandas
asked Feb 28 at 10:23 by Nobi, edited Feb 28 at 17:41 by 200_success
Honestly, pd.read_excel is historically pretty slow and probably your biggest bottleneck. Is it possible to convert to CSV? – Tony, Feb 28 at 14:39
@Tony Converting to CSV is really not an option because of the file sizes and the number of worksheets in the Excel files. – Nobi, Feb 28 at 15:06
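A middle ground worth trying, assuming repeated runs over the same workbook are the real cost: pay the slow pd.read_excel parse once and cache the resulting DataFrame as a pickle, which loads much faster than .xlsx. This is only a sketch; the helper name load_sheet_cached and the cache-path convention are invented for illustration.

import os

import pandas as pd

# Sketch only: parse the worksheet once, then reuse a pickled copy on later
# runs. Pickles load far faster than .xlsx, but are Python/pandas specific.
def load_sheet_cached(xlsx_path, sheetname):
    cache_path = '%s.%s.pkl' % (xlsx_path, sheetname)  # hypothetical naming scheme
    if os.path.exists(cache_path):
        return pd.read_pickle(cache_path)
    df = pd.read_excel(xlsx_path, sheetname=sheetname)
    df.to_pickle(cache_path)
    return df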
1 Answer
Alright, well, if converting to a CSV isn't possible, there are some improvements we can make, but pd.read_excel will probably always be your bottleneck. Regardless, here are some small fixes that might help.
First, and this will make more sense after the other parts, but it is the most important point: maybe I'm missing something with your main function... you pass in just one worksheet name, yet you load the entire workbook? *See NOTE:
import pandas as pd
from pandas import ExcelWriter


def distribution_data_processor(luti_data_file, sheetname):
    """A function that analyses LUTI model trip distribution results and returns
    pivot tables of given scenarios or variable combinations."""
    # # You won't need to initialize any lists, because we're just yielding
    # # pd.DataFrames:
    # # Define variables
    # temp = []
    # list_dfs = []
    # final_dfs_list = []

    # *NOTE: This might actually be why performance is so bad.
    # If you're going to load the whole workbook into pandas and then cut out
    # one worksheet, why not just load only that worksheet?
    d = pd.read_excel(luti_data_file, sheetname=sheetname)
    # # Making a data frame of a given worksheet is no longer needed:
    # d = df_map[sheetname]

    # Delete the Variable column since its values are redundant
    # df.drop(<column name>, axis=<0/1>)  # 0 for row, 1 for column
    d = d.drop('Variable', axis=1)
Second, set(df[col]) will be faster than df[col].unique():
# tested on df.shape == (10, 10000000) with df["a"] = random.choice(list("abc"))
%timeit -n 25 df["a"].unique()  # 221 ms ± 2.93 ms per loop
%timeit -n 25 set(df["a"])      # 171 ms ± 1.2 ms per loop

# In your code, just remove those unique() statements and build the sets in
# the for loops.
# NOTE: you won't find num_mode here because I didn't see it used anywhere.
Third, might as well combine the two for loops:
# Create sub pivot tables from the data
for time in set(d["Time Period"]):
    try:
        # No timing difference between d[bool] and d.loc[bool];
        # it's just nice to stay consistent.
        tp = d.loc[d['Time Period'] == time]
        for pur in set(d["Purpose"]):
            # Note: it's common to wrap big pandas chains in parentheses so
            # you don't need multi-line separators, and can line everything
            # up for readability.
            pivoted = (tp.loc[tp['Purpose'] == pur]
                       .pivot_table(index=['Zone (origin)', 'Time Period', 'Purpose', 'Mode'],
                                    columns=['Zone (destination)'],
                                    values=['1995-Jan-01 00:00:00', '2000-Jan-01 00:00:00', '2005-Jan-01 00:00:00'])
                       .fillna(0.0))
            # ... mask loop below ...
And lastly, by restructuring this into a generator you might see some performance boost:
            # You can also just test for an empty index rather than enumerate.
            # Without your data I'm not sure; this might be bad because the
            # try could have already raised by here?
            if not pivoted.index.any():
                # yield a placeholder for empty pivots and move on
                yield None, None
                continue
            mask = pivoted.index.get_level_values(3) == "Bus"
            mask_true = pivoted[mask]
            mask_false = pivoted[~mask]
            yield mask_true, mask_false
    except IndexError:
        yield None, None
With a little parameter tweaking, turning this into a generator shouldn't be too hard. You'll want to test against None, though, as that's a possibility from both the error catching and the empty pivots.
def save_xls(generator, input_file, worksheet, xls_path):
    """A function to write the results of the distribution
    processor function above to file."""
    writer = ExcelWriter(xls_path)
    increment = 0
    for mask_true, mask_false in generator(input_file, worksheet):
        if mask_true is not None and mask_false is not None:
            mask_true.to_excel(writer, 'sheet%s' % increment)
            increment += 1
            mask_false.to_excel(writer, 'sheet%s' % increment)
            increment += 1
    writer.save()


if __name__ == "__main__":
    save_xls(distribution_data_processor, 'sample.xlsx', 'Distribution1', 'result.xlsx')
answered Feb 28 at 17:01 by Tony (accepted), edited Feb 28 at 18:14
I edited the last bit a bunch; I took a break and got some food. Now I'm skeptical of this solution. I'm thinking a groupby could possibly have worked just as well... – Tony, Feb 28 at 18:20
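A minimal, untested sketch of the groupby idea Tony mentions: it would replace the two nested filtering loops with a single grouped pass, using the same column names as the question. Whether it is actually faster here is unverified.

# Untested sketch of the groupby alternative from the comment above.
# One pass groups by both keys, so no per-value boolean filtering is needed.
for (time, pur), group in d.groupby(['Time Period', 'Purpose']):
    pivoted = group.pivot_table(
        index=['Zone (origin)', 'Time Period', 'Purpose', 'Mode'],
        columns=['Zone (destination)'],
        values=['1995-Jan-01 00:00:00', '2000-Jan-01 00:00:00',
                '2005-Jan-01 00:00:00']).fillna(0.0)
    # split on Mode exactly as before
    mask = pivoted.index.get_level_values(3) == 'Bus'
    bus, other = pivoted[mask], pivoted[~mask]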