Subprocess CSV validator and multiprocessing in Python
This Python script takes a directory of CSV files and calls a Scala script that tests whether a file's contents match a given regular expression. The link to that Scala script is in the third line (the docstring below).
The Scala script takes a single file as its argument, hence this Python script, which feeds it an entire directory's worth of files while also utilizing as much CPU power as possible.
Do you see any issues or potential improvements in my code?
"""
Command line API to CSV validator using Scala implementation from:
http://digital-preservation.github.io/csv-validator/#toc7
"""
PATH_TO_VALIDATOR = r"C:progcsvcsv-validator-cmd-1.2-RC2binvalidate.bat"
PATH_TO_CSV_FOLDER = r"C:progcsvCSVFiles"
PATH_TO_CSV_SCHEMA = r"C:progcsvocr-schema.csvs"
# Set defaults
CSV_ENCODING = "windows-1252"
CSV_SCHEMA_ENCODING = "UTF-8"
def open_csv(CSV_LIST):
    import subprocess
    # To be used to display a simple progress indicator
    TOTAL_FILE_COUNT = len(CSV_LIST)
    current_file_count = 1
    with open("output.txt", 'w') as output:
        for filename in CSV_LIST:
            print("Processing file " + str(current_file_count) + "/" + str(TOTAL_FILE_COUNT))
            output.write(filename + ': ')
            validator = subprocess.Popen(
                [PATH_TO_VALIDATOR, PATH_TO_CSV_FOLDER + "/" + filename, PATH_TO_CSV_SCHEMA, "--csv-encoding",
                 CSV_ENCODING, "--csv-schema-encoding", CSV_SCHEMA_ENCODING, '--fail-fast', 'true'],
                stdout=subprocess.PIPE)
            result = validator.stdout.read()
            output.write(result.decode('windows-1252'))
            current_file_count += 1


# Split a list into n sublists of roughly equal size
def split_list(alist, wanted_parts=1):
    length = len(alist)
    return [alist[i * length // wanted_parts: (i + 1) * length // wanted_parts]
            for i in range(wanted_parts)]


if __name__ == '__main__':
    import argparse
    import multiprocessing
    import os

    parser = argparse.ArgumentParser(description="Command line API to Scala CSV validator")
    parser.add_argument('-pv', '--PATH_TO_VALIDATOR', help="Specify the path to csv-validator-cmd/bin/validator.bat",
                        required=True)
    parser.add_argument('-pf', '--PATH_TO_CSV_FOLDER', help="Specify the path to the folder containing the csv files "
                                                            "you want to validate", required=True)
    parser.add_argument('-ps', '--PATH_TO_CSV_SCHEMA', help="Specify the path to CSV schema you want to use to "
                                                            "validate the given files", required=True)
    parser.add_argument('-cenc', '--CSV_ENCODING', help="Optional parameter to specify the encoding used by the CSV "
                                                        "files. Choose UTF-8 or windows-1252. Default windows-1252")
    parser.add_argument('-csenc', '--CSV_SCHEMA_ENCODING', help="Optional parameter to specify the encoding used by "
                                                                "the CSV Schema. Choose UTF-8 or windows-1252. "
                                                                "Default UTF-8")
    args = vars(parser.parse_args())
    if args['CSV_ENCODING'] is not None:
        CSV_ENCODING = args['CSV_ENCODING']
    if args['CSV_SCHEMA_ENCODING'] is not None:
        CSV_SCHEMA_ENCODING = args['CSV_SCHEMA_ENCODING']
    PATH_TO_VALIDATOR = args["PATH_TO_VALIDATOR"]
    PATH_TO_CSV_SCHEMA = args["PATH_TO_CSV_SCHEMA"]
    PATH_TO_CSV_FOLDER = args["PATH_TO_CSV_FOLDER"]

    CPU_COUNT = multiprocessing.cpu_count()
    split_csv_directory = split_list(os.listdir(args["PATH_TO_CSV_FOLDER"]), wanted_parts=CPU_COUNT)
    # Spawn a Process for each CPU on the system
    for csv_list in split_csv_directory:
        p = multiprocessing.Process(target=open_csv, args=(csv_list,))
        p.start()
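For illustration, here is how the split_list helper above chunks a directory listing into roughly equal parts; the filenames are made up, purely for demonstration:

# Assuming split_list from the script above is in scope
files = ["a.csv", "b.csv", "c.csv", "d.csv", "e.csv"]  # hypothetical filenames
print(split_list(files, wanted_parts=2))
# prints: [['a.csv', 'b.csv'], ['c.csv', 'd.csv', 'e.csv']]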
Tags: python, child-process, multiprocessing
Please do not update the code in your question to incorporate feedback from answers, doing so goes against the Question + Answer style of Code Review. This is not a forum where you should keep the most updated version in your question. Please see what you may and may not do after receiving answers. Revising questions which have answers gets messy very fast.
– Mast
Jan 29 at 11:27
asked Jan 27 at 18:09 by DreamIT
edited Jan 29 at 11:27 by Mast
1 Answer
Only a few small things I would suggest changing.
Pep8
You should consider formatting your code in accordance with pep8. This is important when sharing code, as the consistent style makes it much easier for other programmers to read your code. There are various tools available to assist in making the code pep8 compliant. I use the PyCharm IDE which will show pep8 violations right in the editor.
ALL_CAPS_IS_FOR_CONSTS
Generally, ALL_CAPS is reserved for constants, so this sort of thing:
TOTAL_FILE_COUNT = len(CSV_LIST)
is more Pythonic as:
total_file_count = len(CSV_LIST)
And even better is getting rid of this intermediate assignment altogether with:
print("Processing file /".format(current_file_count, len(CSV_LIST)))
Testing for presence in dicts
Often you will find code checking for a key's presence in a dict before performing some work:
if args['CSV_ENCODING'] is not None:
CSV_ENCODING = args['CSV_ENCODING']
Basically the same thing can be done without the explicit check:
CSV_ENCODING = args.get('CSV_ENCODING', CSV_ENCODING)
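For a standalone illustration of the dict.get pattern, here is a minimal sketch; the settings dict and its keys are hypothetical and not part of the original script:

CSV_ENCODING = "windows-1252"                  # existing default, as in the question
settings = {"CSV_SCHEMA_ENCODING": "UTF-8"}    # hypothetical dict without a 'CSV_ENCODING' key
CSV_ENCODING = settings.get("CSV_ENCODING", CSV_ENCODING)
print(CSV_ENCODING)
# prints: windows-1252 (the key is absent, so the existing default is kept)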
answered Jan 29 at 6:15 by Stephen Rauch