Function to generate pyspark diff and return differences in line
Clash Royale CLAN TAG#URR8PPP
.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;
up vote
1
down vote
favorite
I have written a function that takes two pyspark dataframes and creates a diff in line. I am struggling to get it to scale with 100s of columns. I get around the for
loop calling .join()
hundreds of times. I am stuck, or maybe tired and looking for some suggestions!
from pyspark.sql import functions as sf

def diff_generate(df_one, df_two, pk):
    """Build a per-row, per-column diff of two dataframes keyed on ``pk``.

    For every column present in both dataframes, rows whose value differs
    produce a ``<col>_diff`` column formatted as
    ``<df_one value>_compare:_<df_two value>``; matching rows are null.

    :param df_one: left dataframe; its columns drive the comparison
    :param df_two: right dataframe
    :param pk: name of the primary-key column shared by both frames
    :return: dataframe holding ``pk`` plus one ``<col>_diff`` column per
             compared column
    """
    # Frame that the per-column diffs are successively joined onto.
    diff_df = df_one.select(pk)
    # BUG FIX: the original mutated df_one.schema.names in place to drop
    # the pk, which is unreliable shared-state mutation; filter instead.
    df_two_cols = set(df_two.schema.names)
    for col in df_one.schema.names:
        # Skip the key itself and columns absent from df_two.
        if col == pk or col not in df_two_cols:
            continue
        # BUG FIX: the original wrote "_compare".format(col) /
        # "_diff".format(col) with no "{}" placeholder, so every column
        # collapsed onto the same literal output name.
        compare_col = "{}_compare".format(col)
        diff_col = "{}_diff".format(col)
        df_two_ = df_two.select(pk, col).withColumnRenamed(col, compare_col)
        # Rows of df_one not found identically in df_two (subtract
        # resolves columns positionally, so the rename is harmless),
        # joined back to df_two's value for display.
        df = (
            df_one.select(pk, col)
            .subtract(df_two_)
            .join(df_two_, pk, "left_outer")
        )
        # Concatenate both sides into one human-readable diff cell.
        concat_df = df.withColumn(
            diff_col,
            sf.concat(
                sf.col(col),
                sf.lit("_compare:_"),
                sf.col(compare_col),
            ),
        ).select(pk, diff_col)
        # NOTE(review): one join per column is why this scales poorly at
        # 100+ columns; a single full join of the two frames followed by
        # per-column expressions would remove the loop of joins.
        diff_df = diff_df.join(concat_df, pk, "left_outer")
    return diff_df
Sample result:
98723498,match,N_compare:_null,match,match,match,2018-05-15 18:37_compare:_2018-05-15 18:37:12,match,match
python python-3.x apache-spark hadoop
add a comment |
up vote
1
down vote
favorite
I have written a function that takes two pyspark dataframes and creates a diff in line. I am struggling to get it to scale with 100s of columns. I get around the for
loop calling .join()
hundreds of times. I am stuck, or maybe tired and looking for some suggestions!
from pyspark.sql import functions as sf

def diff_generate(df_one, df_two, pk):
    """Build a per-row, per-column diff of two dataframes keyed on ``pk``.

    For every column present in both dataframes, rows whose value differs
    produce a ``<col>_diff`` column formatted as
    ``<df_one value>_compare:_<df_two value>``; matching rows are null.

    :param df_one: left dataframe; its columns drive the comparison
    :param df_two: right dataframe
    :param pk: name of the primary-key column shared by both frames
    :return: dataframe holding ``pk`` plus one ``<col>_diff`` column per
             compared column
    """
    # Frame that the per-column diffs are successively joined onto.
    diff_df = df_one.select(pk)
    # BUG FIX: the original mutated df_one.schema.names in place to drop
    # the pk, which is unreliable shared-state mutation; filter instead.
    df_two_cols = set(df_two.schema.names)
    for col in df_one.schema.names:
        # Skip the key itself and columns absent from df_two.
        if col == pk or col not in df_two_cols:
            continue
        # BUG FIX: the original wrote "_compare".format(col) /
        # "_diff".format(col) with no "{}" placeholder, so every column
        # collapsed onto the same literal output name.
        compare_col = "{}_compare".format(col)
        diff_col = "{}_diff".format(col)
        df_two_ = df_two.select(pk, col).withColumnRenamed(col, compare_col)
        # Rows of df_one not found identically in df_two (subtract
        # resolves columns positionally, so the rename is harmless),
        # joined back to df_two's value for display.
        df = (
            df_one.select(pk, col)
            .subtract(df_two_)
            .join(df_two_, pk, "left_outer")
        )
        # Concatenate both sides into one human-readable diff cell.
        concat_df = df.withColumn(
            diff_col,
            sf.concat(
                sf.col(col),
                sf.lit("_compare:_"),
                sf.col(compare_col),
            ),
        ).select(pk, diff_col)
        # NOTE(review): one join per column is why this scales poorly at
        # 100+ columns; a single full join of the two frames followed by
        # per-column expressions would remove the loop of joins.
        diff_df = diff_df.join(concat_df, pk, "left_outer")
    return diff_df
Sample result:
98723498,match,N_compare:_null,match,match,match,2018-05-15 18:37_compare:_2018-05-15 18:37:12,match,match
python python-3.x apache-spark hadoop
This code is at least missing some imports. Example usage would be appreciated as well. You say you're having scaling problems. Up to what amount does it still work as intended? Or does it work for all amounts but just very slow?
– Mast
May 22 at 5:29
It works for all parts, just very slow when many columns are used - 100+. The number of rows doesn't seem to have much impact. I have added the missing import. Thanks!
– shannona2013
May 22 at 12:47
add a comment |
up vote
1
down vote
favorite
up vote
1
down vote
favorite
I have written a function that takes two pyspark dataframes and creates a diff in line. I am struggling to get it to scale with 100s of columns. I get around the for
loop calling .join()
hundreds of times. I am stuck, or maybe tired and looking for some suggestions!
from pyspark.sql import functions as sf

def diff_generate(df_one, df_two, pk):
    """Build a per-row, per-column diff of two dataframes keyed on ``pk``.

    For every column present in both dataframes, rows whose value differs
    produce a ``<col>_diff`` column formatted as
    ``<df_one value>_compare:_<df_two value>``; matching rows are null.

    :param df_one: left dataframe; its columns drive the comparison
    :param df_two: right dataframe
    :param pk: name of the primary-key column shared by both frames
    :return: dataframe holding ``pk`` plus one ``<col>_diff`` column per
             compared column
    """
    # Frame that the per-column diffs are successively joined onto.
    diff_df = df_one.select(pk)
    # BUG FIX: the original mutated df_one.schema.names in place to drop
    # the pk, which is unreliable shared-state mutation; filter instead.
    df_two_cols = set(df_two.schema.names)
    for col in df_one.schema.names:
        # Skip the key itself and columns absent from df_two.
        if col == pk or col not in df_two_cols:
            continue
        # BUG FIX: the original wrote "_compare".format(col) /
        # "_diff".format(col) with no "{}" placeholder, so every column
        # collapsed onto the same literal output name.
        compare_col = "{}_compare".format(col)
        diff_col = "{}_diff".format(col)
        df_two_ = df_two.select(pk, col).withColumnRenamed(col, compare_col)
        # Rows of df_one not found identically in df_two (subtract
        # resolves columns positionally, so the rename is harmless),
        # joined back to df_two's value for display.
        df = (
            df_one.select(pk, col)
            .subtract(df_two_)
            .join(df_two_, pk, "left_outer")
        )
        # Concatenate both sides into one human-readable diff cell.
        concat_df = df.withColumn(
            diff_col,
            sf.concat(
                sf.col(col),
                sf.lit("_compare:_"),
                sf.col(compare_col),
            ),
        ).select(pk, diff_col)
        # NOTE(review): one join per column is why this scales poorly at
        # 100+ columns; a single full join of the two frames followed by
        # per-column expressions would remove the loop of joins.
        diff_df = diff_df.join(concat_df, pk, "left_outer")
    return diff_df
Sample result:
98723498,match,N_compare:_null,match,match,match,2018-05-15 18:37_compare:_2018-05-15 18:37:12,match,match
python python-3.x apache-spark hadoop
I have written a function that takes two pyspark dataframes and creates a diff in line. I am struggling to get it to scale with 100s of columns. I get around the for
loop calling .join()
hundreds of times. I am stuck, or maybe tired and looking for some suggestions!
from pyspark.sql import functions as sf

def diff_generate(df_one, df_two, pk):
    """Build a per-row, per-column diff of two dataframes keyed on ``pk``.

    For every column present in both dataframes, rows whose value differs
    produce a ``<col>_diff`` column formatted as
    ``<df_one value>_compare:_<df_two value>``; matching rows are null.

    :param df_one: left dataframe; its columns drive the comparison
    :param df_two: right dataframe
    :param pk: name of the primary-key column shared by both frames
    :return: dataframe holding ``pk`` plus one ``<col>_diff`` column per
             compared column
    """
    # Frame that the per-column diffs are successively joined onto.
    diff_df = df_one.select(pk)
    # BUG FIX: the original mutated df_one.schema.names in place to drop
    # the pk, which is unreliable shared-state mutation; filter instead.
    df_two_cols = set(df_two.schema.names)
    for col in df_one.schema.names:
        # Skip the key itself and columns absent from df_two.
        if col == pk or col not in df_two_cols:
            continue
        # BUG FIX: the original wrote "_compare".format(col) /
        # "_diff".format(col) with no "{}" placeholder, so every column
        # collapsed onto the same literal output name.
        compare_col = "{}_compare".format(col)
        diff_col = "{}_diff".format(col)
        df_two_ = df_two.select(pk, col).withColumnRenamed(col, compare_col)
        # Rows of df_one not found identically in df_two (subtract
        # resolves columns positionally, so the rename is harmless),
        # joined back to df_two's value for display.
        df = (
            df_one.select(pk, col)
            .subtract(df_two_)
            .join(df_two_, pk, "left_outer")
        )
        # Concatenate both sides into one human-readable diff cell.
        concat_df = df.withColumn(
            diff_col,
            sf.concat(
                sf.col(col),
                sf.lit("_compare:_"),
                sf.col(compare_col),
            ),
        ).select(pk, diff_col)
        # NOTE(review): one join per column is why this scales poorly at
        # 100+ columns; a single full join of the two frames followed by
        # per-column expressions would remove the loop of joins.
        diff_df = diff_df.join(concat_df, pk, "left_outer")
    return diff_df
Sample result:
98723498,match,N_compare:_null,match,match,match,2018-05-15 18:37_compare:_2018-05-15 18:37:12,match,match
python python-3.x apache-spark hadoop
edited May 22 at 12:48
asked May 22 at 4:43
shannona2013
62
62
This code is at least missing some imports. Example usage would be appreciated as well. You say you're having scaling problems. Up to what amount does it still work as intended? Or does it work for all amounts but just very slow?
– Mast
May 22 at 5:29
It works for all parts, just very slow when many columns are used - 100+. The number of rows doesn't seem to have much impact. I have added the missing import. Thanks!
– shannona2013
May 22 at 12:47
add a comment |
This code is at least missing some imports. Example usage would be appreciated as well. You say you're having scaling problems. Up to what amount does it still work as intended? Or does it work for all amounts but just very slow?
– Mast
May 22 at 5:29
It works for all parts, just very slow when many columns are used - 100+. The number of rows doesn't seem to have much impact. I have added the missing import. Thanks!
– shannona2013
May 22 at 12:47
This code is at least missing some imports. Example usage would be appreciated as well. You say you're having scaling problems. Up to what amount does it still work as intended? Or does it work for all amounts but just very slow?
– Mast
May 22 at 5:29
This code is at least missing some imports. Example usage would be appreciated as well. You say you're having scaling problems. Up to what amount does it still work as intended? Or does it work for all amounts but just very slow?
– Mast
May 22 at 5:29
It works for all parts, just very slow when many columns are used - 100+. The number of rows doesn't seem to have much impact. I have added the missing import. Thanks!
– shannona2013
May 22 at 12:47
It works for all parts, just very slow when many columns are used - 100+. The number of rows doesn't seem to have much impact. I have added the missing import. Thanks!
– shannona2013
May 22 at 12:47
add a comment |
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f194918%2ffunction-to-generate-pyspark-diff-and-return-differences-in-line%23new-answer', 'question_page');
);
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
This code is at least missing some imports. Example usage would be appreciated as well. You say you're having scaling problems. Up to what amount does it still work as intended? Or does it work for all amounts but just very slow?
– Mast
May 22 at 5:29
It works for all parts, just very slow when many columns are used - 100+. The number of rows doesn't seem to have much impact. I have added the missing import. Thanks!
– shannona2013
May 22 at 12:47