Function to generate pyspark diff and return differences in line
Clash Royale CLAN TAG#URR8PPP
.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;
up vote
1
down vote
favorite
I have written a function that takes two pyspark dataframes and creates a diff in line. I am struggling to get it to scale with 100s of columns. I get around the for
loop calling .join()
hundreds of times. I am stuck, or maybe tired and looking for some suggestions!
from pyspark.sql import functions as sf

def diff_generate(df_one, df_two, pk):
    """Build a per-row, per-column diff of two dataframes keyed on ``pk``.

    For every column present in both dataframes, rows whose value differs
    produce a ``<col>_diff`` column formatted as
    ``<df_one value>_compare:_<df_two value>``; matching rows are null.

    :param df_one: left dataframe; its columns drive the comparison
    :param df_two: right dataframe
    :param pk: name of the primary-key column shared by both frames
    :return: dataframe holding ``pk`` plus one ``<col>_diff`` column per
             compared column
    """
    # Frame that the per-column diffs are successively joined onto.
    diff_df = df_one.select(pk)
    # BUG FIX: the original mutated df_one.schema.names in place to drop
    # the pk, which is unreliable shared-state mutation; filter instead.
    df_two_cols = set(df_two.schema.names)
    for col in df_one.schema.names:
        # Skip the key itself and columns absent from df_two.
        if col == pk or col not in df_two_cols:
            continue
        # BUG FIX: the original wrote "_compare".format(col) /
        # "_diff".format(col) with no "{}" placeholder, so every column
        # collapsed onto the same literal output name.
        compare_col = "{}_compare".format(col)
        diff_col = "{}_diff".format(col)
        df_two_ = df_two.select(pk, col).withColumnRenamed(col, compare_col)
        # Rows of df_one not found identically in df_two (subtract
        # resolves columns positionally, so the rename is harmless),
        # joined back to df_two's value for display.
        df = (
            df_one.select(pk, col)
            .subtract(df_two_)
            .join(df_two_, pk, "left_outer")
        )
        # Concatenate both sides into one human-readable diff cell.
        concat_df = df.withColumn(
            diff_col,
            sf.concat(
                sf.col(col),
                sf.lit("_compare:_"),
                sf.col(compare_col),
            ),
        ).select(pk, diff_col)
        # NOTE(review): one join per column is why this scales poorly at
        # 100+ columns; a single full join of the two frames followed by
        # per-column expressions would remove the loop of joins.
        diff_df = diff_df.join(concat_df, pk, "left_outer")
    return diff_df
Sample result:
98723498,match,N_compare:_null,match,match,match,2018-05-15 18:37_compare:_2018-05-15 18:37:12,match,match
python python-3.x apache-spark hadoop
add a comment |
up vote
1
down vote
favorite
I have written a function that takes two pyspark dataframes and creates a diff in line. I am struggling to get it to scale with 100s of columns. I get around the for
loop calling .join()
hundreds of times. I am stuck, or maybe tired and looking for some suggestions!
from pyspark.sql import functions as sf

def diff_generate(df_one, df_two, pk):
    """Build a per-row, per-column diff of two dataframes keyed on ``pk``.

    For every column present in both dataframes, rows whose value differs
    produce a ``<col>_diff`` column formatted as
    ``<df_one value>_compare:_<df_two value>``; matching rows are null.

    :param df_one: left dataframe; its columns drive the comparison
    :param df_two: right dataframe
    :param pk: name of the primary-key column shared by both frames
    :return: dataframe holding ``pk`` plus one ``<col>_diff`` column per
             compared column
    """
    # Frame that the per-column diffs are successively joined onto.
    diff_df = df_one.select(pk)
    # BUG FIX: the original mutated df_one.schema.names in place to drop
    # the pk, which is unreliable shared-state mutation; filter instead.
    df_two_cols = set(df_two.schema.names)
    for col in df_one.schema.names:
        # Skip the key itself and columns absent from df_two.
        if col == pk or col not in df_two_cols:
            continue
        # BUG FIX: the original wrote "_compare".format(col) /
        # "_diff".format(col) with no "{}" placeholder, so every column
        # collapsed onto the same literal output name.
        compare_col = "{}_compare".format(col)
        diff_col = "{}_diff".format(col)
        df_two_ = df_two.select(pk, col).withColumnRenamed(col, compare_col)
        # Rows of df_one not found identically in df_two (subtract
        # resolves columns positionally, so the rename is harmless),
        # joined back to df_two's value for display.
        df = (
            df_one.select(pk, col)
            .subtract(df_two_)
            .join(df_two_, pk, "left_outer")
        )
        # Concatenate both sides into one human-readable diff cell.
        concat_df = df.withColumn(
            diff_col,
            sf.concat(
                sf.col(col),
                sf.lit("_compare:_"),
                sf.col(compare_col),
            ),
        ).select(pk, diff_col)
        # NOTE(review): one join per column is why this scales poorly at
        # 100+ columns; a single full join of the two frames followed by
        # per-column expressions would remove the loop of joins.
        diff_df = diff_df.join(concat_df, pk, "left_outer")
    return diff_df
Sample result:
98723498,match,N_compare:_null,match,match,match,2018-05-15 18:37_compare:_2018-05-15 18:37:12,match,match
python python-3.x apache-spark hadoop
This code is at least missing some imports. Example usage would be appreciated as well. You say you're having scaling problems. Up to what amount does it still work as intended? Or does it work for all amounts but just very slow?
– Mast
May 22 at 5:29
It works for all parts, just very slow when many columns are used - 100+. The number of rows doesn't seem to have much impact. I have added the missing import. Thanks!
– shannona2013
May 22 at 12:47
add a comment |
up vote
1
down vote
favorite
up vote
1
down vote
favorite
I have written a function that takes two pyspark dataframes and creates a diff in line. I am struggling to get it to scale with 100s of columns. I get around the for
loop calling .join()
hundreds of times. I am stuck, or maybe tired and looking for some suggestions!
from pyspark.sql import functions as sf

def diff_generate(df_one, df_two, pk):
    """Build a per-row, per-column diff of two dataframes keyed on ``pk``.

    For every column present in both dataframes, rows whose value differs
    produce a ``<col>_diff`` column formatted as
    ``<df_one value>_compare:_<df_two value>``; matching rows are null.

    :param df_one: left dataframe; its columns drive the comparison
    :param df_two: right dataframe
    :param pk: name of the primary-key column shared by both frames
    :return: dataframe holding ``pk`` plus one ``<col>_diff`` column per
             compared column
    """
    # Frame that the per-column diffs are successively joined onto.
    diff_df = df_one.select(pk)
    # BUG FIX: the original mutated df_one.schema.names in place to drop
    # the pk, which is unreliable shared-state mutation; filter instead.
    df_two_cols = set(df_two.schema.names)
    for col in df_one.schema.names:
        # Skip the key itself and columns absent from df_two.
        if col == pk or col not in df_two_cols:
            continue
        # BUG FIX: the original wrote "_compare".format(col) /
        # "_diff".format(col) with no "{}" placeholder, so every column
        # collapsed onto the same literal output name.
        compare_col = "{}_compare".format(col)
        diff_col = "{}_diff".format(col)
        df_two_ = df_two.select(pk, col).withColumnRenamed(col, compare_col)
        # Rows of df_one not found identically in df_two (subtract
        # resolves columns positionally, so the rename is harmless),
        # joined back to df_two's value for display.
        df = (
            df_one.select(pk, col)
            .subtract(df_two_)
            .join(df_two_, pk, "left_outer")
        )
        # Concatenate both sides into one human-readable diff cell.
        concat_df = df.withColumn(
            diff_col,
            sf.concat(
                sf.col(col),
                sf.lit("_compare:_"),
                sf.col(compare_col),
            ),
        ).select(pk, diff_col)
        # NOTE(review): one join per column is why this scales poorly at
        # 100+ columns; a single full join of the two frames followed by
        # per-column expressions would remove the loop of joins.
        diff_df = diff_df.join(concat_df, pk, "left_outer")
    return diff_df
Sample result:
98723498,match,N_compare:_null,match,match,match,2018-05-15 18:37_compare:_2018-05-15 18:37:12,match,match
python python-3.x apache-spark hadoop
I have written a function that takes two pyspark dataframes and creates a diff in line. I am struggling to get it to scale with 100s of columns. I get around the for
loop calling .join()
hundreds of times. I am stuck, or maybe tired and looking for some suggestions!
from pyspark.sql import functions as sf

def diff_generate(df_one, df_two, pk):
    """Build a per-row, per-column diff of two dataframes keyed on ``pk``.

    For every column present in both dataframes, rows whose value differs
    produce a ``<col>_diff`` column formatted as
    ``<df_one value>_compare:_<df_two value>``; matching rows are null.

    :param df_one: left dataframe; its columns drive the comparison
    :param df_two: right dataframe
    :param pk: name of the primary-key column shared by both frames
    :return: dataframe holding ``pk`` plus one ``<col>_diff`` column per
             compared column
    """
    # Frame that the per-column diffs are successively joined onto.
    diff_df = df_one.select(pk)
    # BUG FIX: the original mutated df_one.schema.names in place to drop
    # the pk, which is unreliable shared-state mutation; filter instead.
    df_two_cols = set(df_two.schema.names)
    for col in df_one.schema.names:
        # Skip the key itself and columns absent from df_two.
        if col == pk or col not in df_two_cols:
            continue
        # BUG FIX: the original wrote "_compare".format(col) /
        # "_diff".format(col) with no "{}" placeholder, so every column
        # collapsed onto the same literal output name.
        compare_col = "{}_compare".format(col)
        diff_col = "{}_diff".format(col)
        df_two_ = df_two.select(pk, col).withColumnRenamed(col, compare_col)
        # Rows of df_one not found identically in df_two (subtract
        # resolves columns positionally, so the rename is harmless),
        # joined back to df_two's value for display.
        df = (
            df_one.select(pk, col)
            .subtract(df_two_)
            .join(df_two_, pk, "left_outer")
        )
        # Concatenate both sides into one human-readable diff cell.
        concat_df = df.withColumn(
            diff_col,
            sf.concat(
                sf.col(col),
                sf.lit("_compare:_"),
                sf.col(compare_col),
            ),
        ).select(pk, diff_col)
        # NOTE(review): one join per column is why this scales poorly at
        # 100+ columns; a single full join of the two frames followed by
        # per-column expressions would remove the loop of joins.
        diff_df = diff_df.join(concat_df, pk, "left_outer")
    return diff_df
Sample result:
98723498,match,N_compare:_null,match,match,match,2018-05-15 18:37_compare:_2018-05-15 18:37:12,match,match
python python-3.x apache-spark hadoop
edited May 22 at 12:48
asked May 22 at 4:43
shannona2013
62
62
This code is at least missing some imports. Example usage would be appreciated as well. You say you're having scaling problems. Up to what amount does it still work as intended? Or does it work for all amounts but just very slow?
– Mast
May 22 at 5:29
It works for all parts, just very slow when many columns are used - 100+. The number of rows doesn't seem to have much impact. I have added the missing import. Thanks!
– shannona2013
May 22 at 12:47
add a comment |
This code is at least missing some imports. Example usage would be appreciated as well. You say you're having scaling problems. Up to what amount does it still work as intended? Or does it work for all amounts but just very slow?
– Mast
May 22 at 5:29
It works for all parts, just very slow when many columns are used - 100+. The number of rows doesn't seem to have much impact. I have added the missing import. Thanks!
– shannona2013
May 22 at 12:47
This code is at least missing some imports. Example usage would be appreciated as well. You say you're having scaling problems. Up to what amount does it still work as intended? Or does it work for all amounts but just very slow?
– Mast
May 22 at 5:29
This code is at least missing some imports. Example usage would be appreciated as well. You say you're having scaling problems. Up to what amount does it still work as intended? Or does it work for all amounts but just very slow?
– Mast
May 22 at 5:29
It works for all parts, just very slow when many columns are used - 100+. The number of rows doesn't seem to have much impact. I have added the missing import. Thanks!
– shannona2013
May 22 at 12:47
It works for all parts, just very slow when many columns are used - 100+. The number of rows doesn't seem to have much impact. I have added the missing import. Thanks!
– shannona2013
May 22 at 12:47
add a comment |
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f194918%2ffunction-to-generate-pyspark-diff-and-return-differences-in-line%23new-answer', 'question_page');
);
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
This code is at least missing some imports. Example usage would be appreciated as well. You say you're having scaling problems. Up to what amount does it still work as intended? Or does it work for all amounts but just very slow?
– Mast
May 22 at 5:29
It works for all parts, just very slow when many columns are used - 100+. The number of rows doesn't seem to have much impact. I have added the missing import. Thanks!
– shannona2013
May 22 at 12:47