Function to generate pyspark diff and return differences in line

Function to generate pyspark diff and return differences in line





.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;







up vote
1
down vote

favorite












I have written a function that takes two pyspark dataframes and creates a diff in line. I am struggling to get it to scale with 100s of columns. I get around the for loop calling .join() hundreds of times. I am stuck, or maybe tired and looking for some suggestions!



from pyspark.sql import functions as sf


def diff_generate(df_one, df_two, pk):
    """Generate a column-by-column diff of two dataframes keyed on ``pk``.

    For each non-key column present in both dataframes, rows whose value
    differs between the frames get a ``<col>_diff`` column of the form
    ``"<df_one value>_compare:_<df_two value>"``; matching rows are null.

    :param df_one: baseline pyspark DataFrame
    :param df_two: comparison pyspark DataFrame
    :param pk: name of the primary-key column shared by both frames
    :return: DataFrame keyed on ``pk`` with one ``<col>_diff`` column per
        compared column
    """
    # create df to join compared columns with
    diff_df = df_one.select(pk)
    # BUG FIX: the original `df_one.schema.names.remove(pk)` mutated a
    # throwaway list (StructType.names builds a new list on every access),
    # so pk was never excluded. Build an explicit column list instead.
    columns = [c for c in df_one.schema.names if c != pk]
    for col in columns:
        # ensure that col exists in df2 before comparing
        if col not in df_two.schema.names:
            continue
        # BUG FIX: '"_compare".format(col)' had no {} placeholder, so every
        # column collapsed onto the literal names "_compare"/"_diff".
        compare_col = "{}_compare".format(col)
        diff_col = "{}_diff".format(col)
        df_two_ = (df_two.select(pk, col)
                   .withColumnRenamed(col, compare_col))
        # rows of df_one that differ for this column (subtract resolves
        # columns by position, so the rename does not matter), then
        # re-attach df_two's value for display
        df = (df_one.select(pk, col)
              .subtract(df_two_)
              .join(df_two_, pk, "left_outer"))
        # concat df columns to display the comparison
        concat_df = (df
                     .withColumn(diff_col,
                                 sf.concat(sf.col(col),
                                           sf.lit("_compare:_"),
                                           sf.col(compare_col)))
                     .select(pk, diff_col))
        # join column 'diff' with the initialized df
        diff_df = diff_df.join(concat_df, pk, "left_outer")
    return diff_df


Sample result:



98723498,match,N_compare:_null,match,match,match,2018-05-15 18:37_compare:_2018-05-15 18:37:12,match,match






share|improve this question





















  • This code is at least missing some imports. Example usage would be appreciated as well. You say you're having scaling problems. Up to what amount does it still work as intended? Or does it work for all amounts but just very slow?
    – Mast
    May 22 at 5:29










  • It works for all amounts, just very slow when many columns are used - 100+... The number of rows doesn't seem to have much impact. I have added the missing import. Thanks!
    – shannona2013
    May 22 at 12:47

















up vote
1
down vote

favorite












I have written a function that takes two pyspark dataframes and creates a diff in line. I am struggling to get it to scale with 100s of columns. I get around the for loop calling .join() hundreds of times. I am stuck, or maybe tired and looking for some suggestions!



from pyspark.sql import functions as sf


def diff_generate(df_one, df_two, pk):
    """Generate a column-by-column diff of two dataframes keyed on ``pk``.

    For each non-key column present in both dataframes, rows whose value
    differs between the frames get a ``<col>_diff`` column of the form
    ``"<df_one value>_compare:_<df_two value>"``; matching rows are null.

    :param df_one: baseline pyspark DataFrame
    :param df_two: comparison pyspark DataFrame
    :param pk: name of the primary-key column shared by both frames
    :return: DataFrame keyed on ``pk`` with one ``<col>_diff`` column per
        compared column
    """
    # create df to join compared columns with
    diff_df = df_one.select(pk)
    # BUG FIX: the original `df_one.schema.names.remove(pk)` mutated a
    # throwaway list (StructType.names builds a new list on every access),
    # so pk was never excluded. Build an explicit column list instead.
    columns = [c for c in df_one.schema.names if c != pk]
    for col in columns:
        # ensure that col exists in df2 before comparing
        if col not in df_two.schema.names:
            continue
        # BUG FIX: '"_compare".format(col)' had no {} placeholder, so every
        # column collapsed onto the literal names "_compare"/"_diff".
        compare_col = "{}_compare".format(col)
        diff_col = "{}_diff".format(col)
        df_two_ = (df_two.select(pk, col)
                   .withColumnRenamed(col, compare_col))
        # rows of df_one that differ for this column (subtract resolves
        # columns by position, so the rename does not matter), then
        # re-attach df_two's value for display
        df = (df_one.select(pk, col)
              .subtract(df_two_)
              .join(df_two_, pk, "left_outer"))
        # concat df columns to display the comparison
        concat_df = (df
                     .withColumn(diff_col,
                                 sf.concat(sf.col(col),
                                           sf.lit("_compare:_"),
                                           sf.col(compare_col)))
                     .select(pk, diff_col))
        # join column 'diff' with the initialized df
        diff_df = diff_df.join(concat_df, pk, "left_outer")
    return diff_df


Sample result:



98723498,match,N_compare:_null,match,match,match,2018-05-15 18:37_compare:_2018-05-15 18:37:12,match,match






share|improve this question





















  • This code is at least missing some imports. Example usage would be appreciated as well. You say you're having scaling problems. Up to what amount does it still work as intended? Or does it work for all amounts but just very slow?
    – Mast
    May 22 at 5:29










  • It works for all parts, just very slow when many columns are used - 100+...The number of rows doesn't seem to have much impact . I have added the missing import. Thanks!
    – shannona2013
    May 22 at 12:47













up vote
1
down vote

favorite









up vote
1
down vote

favorite











I have written a function that takes two pyspark dataframes and creates a diff in line. I am struggling to get it to scale with 100s of columns. I get around the for loop calling .join() hundreds of times. I am stuck, or maybe tired and looking for some suggestions!



from pyspark.sql import functions as sf


def diff_generate(df_one, df_two, pk):
    """Generate a column-by-column diff of two dataframes keyed on ``pk``.

    For each non-key column present in both dataframes, rows whose value
    differs between the frames get a ``<col>_diff`` column of the form
    ``"<df_one value>_compare:_<df_two value>"``; matching rows are null.

    :param df_one: baseline pyspark DataFrame
    :param df_two: comparison pyspark DataFrame
    :param pk: name of the primary-key column shared by both frames
    :return: DataFrame keyed on ``pk`` with one ``<col>_diff`` column per
        compared column
    """
    # create df to join compared columns with
    diff_df = df_one.select(pk)
    # BUG FIX: the original `df_one.schema.names.remove(pk)` mutated a
    # throwaway list (StructType.names builds a new list on every access),
    # so pk was never excluded. Build an explicit column list instead.
    columns = [c for c in df_one.schema.names if c != pk]
    for col in columns:
        # ensure that col exists in df2 before comparing
        if col not in df_two.schema.names:
            continue
        # BUG FIX: '"_compare".format(col)' had no {} placeholder, so every
        # column collapsed onto the literal names "_compare"/"_diff".
        compare_col = "{}_compare".format(col)
        diff_col = "{}_diff".format(col)
        df_two_ = (df_two.select(pk, col)
                   .withColumnRenamed(col, compare_col))
        # rows of df_one that differ for this column (subtract resolves
        # columns by position, so the rename does not matter), then
        # re-attach df_two's value for display
        df = (df_one.select(pk, col)
              .subtract(df_two_)
              .join(df_two_, pk, "left_outer"))
        # concat df columns to display the comparison
        concat_df = (df
                     .withColumn(diff_col,
                                 sf.concat(sf.col(col),
                                           sf.lit("_compare:_"),
                                           sf.col(compare_col)))
                     .select(pk, diff_col))
        # join column 'diff' with the initialized df
        diff_df = diff_df.join(concat_df, pk, "left_outer")
    return diff_df


Sample result:



98723498,match,N_compare:_null,match,match,match,2018-05-15 18:37_compare:_2018-05-15 18:37:12,match,match






share|improve this question













I have written a function that takes two pyspark dataframes and creates a diff in line. I am struggling to get it to scale with 100s of columns. I get around the for loop calling .join() hundreds of times. I am stuck, or maybe tired and looking for some suggestions!



from pyspark.sql import functions as sf


def diff_generate(df_one, df_two, pk):
    """Generate a column-by-column diff of two dataframes keyed on ``pk``.

    For each non-key column present in both dataframes, rows whose value
    differs between the frames get a ``<col>_diff`` column of the form
    ``"<df_one value>_compare:_<df_two value>"``; matching rows are null.

    :param df_one: baseline pyspark DataFrame
    :param df_two: comparison pyspark DataFrame
    :param pk: name of the primary-key column shared by both frames
    :return: DataFrame keyed on ``pk`` with one ``<col>_diff`` column per
        compared column
    """
    # create df to join compared columns with
    diff_df = df_one.select(pk)
    # BUG FIX: the original `df_one.schema.names.remove(pk)` mutated a
    # throwaway list (StructType.names builds a new list on every access),
    # so pk was never excluded. Build an explicit column list instead.
    columns = [c for c in df_one.schema.names if c != pk]
    for col in columns:
        # ensure that col exists in df2 before comparing
        if col not in df_two.schema.names:
            continue
        # BUG FIX: '"_compare".format(col)' had no {} placeholder, so every
        # column collapsed onto the literal names "_compare"/"_diff".
        compare_col = "{}_compare".format(col)
        diff_col = "{}_diff".format(col)
        df_two_ = (df_two.select(pk, col)
                   .withColumnRenamed(col, compare_col))
        # rows of df_one that differ for this column (subtract resolves
        # columns by position, so the rename does not matter), then
        # re-attach df_two's value for display
        df = (df_one.select(pk, col)
              .subtract(df_two_)
              .join(df_two_, pk, "left_outer"))
        # concat df columns to display the comparison
        concat_df = (df
                     .withColumn(diff_col,
                                 sf.concat(sf.col(col),
                                           sf.lit("_compare:_"),
                                           sf.col(compare_col)))
                     .select(pk, diff_col))
        # join column 'diff' with the initialized df
        diff_df = diff_df.join(concat_df, pk, "left_outer")
    return diff_df


Sample result:



98723498,match,N_compare:_null,match,match,match,2018-05-15 18:37_compare:_2018-05-15 18:37:12,match,match








share|improve this question












share|improve this question




share|improve this question








edited May 22 at 12:48
























asked May 22 at 4:43









shannona2013

62




62











  • This code is at least missing some imports. Example usage would be appreciated as well. You say you're having scaling problems. Up to what amount does it still work as intended? Or does it work for all amounts but just very slow?
    – Mast
    May 22 at 5:29










  • It works for all parts, just very slow when many columns are used - 100+...The number of rows doesn't seem to have much impact . I have added the missing import. Thanks!
    – shannona2013
    May 22 at 12:47

















  • This code is at least missing some imports. Example usage would be appreciated as well. You say you're having scaling problems. Up to what amount does it still work as intended? Or does it work for all amounts but just very slow?
    – Mast
    May 22 at 5:29










  • It works for all parts, just very slow when many columns are used - 100+...The number of rows doesn't seem to have much impact . I have added the missing import. Thanks!
    – shannona2013
    May 22 at 12:47
















This code is at least missing some imports. Example usage would be appreciated as well. You say you're having scaling problems. Up to what amount does it still work as intended? Or does it work for all amounts but just very slow?
– Mast
May 22 at 5:29




This code is at least missing some imports. Example usage would be appreciated as well. You say you're having scaling problems. Up to what amount does it still work as intended? Or does it work for all amounts but just very slow?
– Mast
May 22 at 5:29












It works for all parts, just very slow when many columns are used - 100+...The number of rows doesn't seem to have much impact . I have added the missing import. Thanks!
– shannona2013
May 22 at 12:47





It works for all parts, just very slow when many columns are used - 100+...The number of rows doesn't seem to have much impact . I have added the missing import. Thanks!
– shannona2013
May 22 at 12:47
















active

oldest

votes











Your Answer




StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);



);








 

draft saved


draft discarded


















StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f194918%2ffunction-to-generate-pyspark-diff-and-return-differences-in-line%23new-answer', 'question_page');

);

Post as a guest



































active

oldest

votes













active

oldest

votes









active

oldest

votes






active

oldest

votes










 

draft saved


draft discarded


























 


draft saved


draft discarded














StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f194918%2ffunction-to-generate-pyspark-diff-and-return-differences-in-line%23new-answer', 'question_page');

);

Post as a guest













































































Popular posts from this blog

Greedy Best First Search implementation in Rust

Function to Return a JSON Like Objects Using VBA Collections and Arrays

C++11 CLH Lock Implementation