Extracting text from Bundestag proceedings as directed by XML files, and exporting to JSON
Clash Royale CLAN TAG#URR8PPP
.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty margin-bottom:0;
up vote
3
down vote
favorite
I have a long script here that does the following for me: I have 9000 XML files, from which I parse information, and then in about 7000 TXT files find the information out, and in the end it all save as JSON. Well, aside from what he does, he does it very slowly.
Since I am very new to Python, I did not learn Python from books, but just try it out. The code here is normal fast up to 100 iterations, but from about 120 iterations it gets very slow.
What can I do against it? I've been looking for solutions like generators, Cprofile, Cython, pypy, and similar, but all too advanced for me. How can I optimize this code to make it run faster? gc.collect()
did not help. I think it's more about the algorithm and the flow scheme, rather than what it does.
Sample Files: Files
cProfile Output: cProfile Output
import xml.etree.ElementTree as ET
import glob, os, re
import mpu.io
PATH_FINAL_XML = "/Users/x/PycharmProjects/BA/booking-system/final step/xml files"
PATH_FINAL_TXT = "/Users/x/PycharmProjects/BA/booking-system/final step/txt files"
os.chdir(PATH_FINAL_XML)
#path, dirs, files = os.walk(PATH).__next__()
#print("Files", len(files))
gesetzliste = list()
gesetzanzahlliste = list()
zeitliste = list()
br_list = ['Baden-Württemberg', "Bayern", "Berlin", "Bremen", "Hamburg",
"Hamburg", "Mecklenburg-Vorpommern",
"Nordrhein-Westfalen", "Niedersachsen", "Rheinland-Pfalz", "Saarland",
"Sachsen", "Sachsen-Anhalt", "Schleswig-Holstein", "Thüringen", "Bundesregierung"]
MdB = "MdB"
pdf_xml_pattern = re.compile("(?<=/)d+(?=.pdf)") # PDF Miner Pattern
split_pattern_president = r"n+(?=(?:Präsident|Präsidentin|Vizepräsident|Vizepräsidentin)s*(?:(?:Dr.)*s*[A-ZÃÂÃÂÃÂÃÂ]+[a-zöäüÃÂ]*s*)*:)" # Präsident und Vizepräsident im BT
split_rest_pattern = r"n+(?=(?:Dr.)*s*[A-ZÃÂÃÂÃÂ]*[a-züäö]*s*[A-ZÃÂÃÂÃÂ]*[a-züäö]*s*([A-ZÃÂÃÂÃÂX-,./s]*))" # alle auÃÂer, MdB, Minister, Gastredner etc.
zurufe_pattern = r"(((?:(?!bDokumentb|bDrucksacheb|bTagesordnungb|b(BÃÂNDNIS 90/DIE GRÃÂNEN)b)[A-Za-z_äÃÂöÃÂüÃÂÃÂ[]/,._âÂÂ!?:;'-âÂÂ0-9]*s)2,[A-Za-z_äÃÂöÃÂüÃÂÃÂ;!.?[]/,':-âÂÂâÂÂ0-9]+[.,?!]*?))" # um zurufe zu identifizieren
def two():
for file in glob.glob("*.xml"): # xml path
gesetz = dict()
'''Childnode Lists'''
schlagworterliste = list()
sachgebietliste = list()
drsliste = list()
plenumliste = list()
vorgangliste = list()
speakeritems = list()
mdb_splitterlist = list()
mdb_not_splitterlist = list()
splitterlist = list()
'''Childnode Lists'''
file_id = file.replace(".xml", "")
print("Gesetzentwurf: ", file)
tree = ET.parse(file)
root = tree.getroot()
for child in root:
gesetz["File_ID"] = file_id
if str(child.tag) == "SCHLAGWORT":
schlagworterliste.append(child.text)
gesetz[child.tag] = schlagworterliste
elif str(child.tag) == "SACHGEBIET":
sachgebietliste.append(child.text)
gesetz[child.tag] = sachgebietliste
elif str(child.tag) == "WICHTIGE_DRUCKSACHE":
for child_wichtige_drucksache_herausgeber in child.findall("DRS_HERAUSGEBER"):
for child_wichtige_drucksache_nummer in child.findall("DRS_NUMMER"):
for child_wichtige_drucksache_typ in child.findall("DRS_TYP"):
drs = child_wichtige_drucksache_herausgeber.tag:child_wichtige_drucksache_herausgeber.text,
child_wichtige_drucksache_nummer.tag:child_wichtige_drucksache_nummer.text,
child_wichtige_drucksache_typ.tag:child_wichtige_drucksache_typ.text
drsliste.append(drs)
gesetz[child.tag] = drsliste
elif str(child.tag) == "PLENUM":
for child_plenum_klartext in child.findall("PLPR_KLARTEXT"):
for child_plenum_herausgeber in child.findall("PLPR_HERAUSGEBER"):
for child_plenum_nummer in child.findall("PLPR_NUMMER"):
for child_plenum_seiten in child.findall("PLPR_SEITEN"):
for child_plenum_link in child.findall("PLPR_LINK"):
plenum = child_plenum_klartext.tag:child_plenum_klartext.text,
child_plenum_herausgeber.tag:child_plenum_herausgeber.text,
child_plenum_nummer.tag:child_plenum_nummer.text,
child_plenum_seiten.tag:child_plenum_seiten.text,
child_plenum_link.tag:child_plenum_link.text
plenumliste.append(plenum)
gesetz["PLPR"] = plenumliste
else:
gesetz[child.tag] = child.text
if "INITIATIVE" in gesetz.keys():
if gesetz['INITIATIVE'] in br_list:
print("CONTINUE")
continue
else:
print("DEVAM")
for child2 in child:
for child_zuordnung in child2.findall("ZUORDNUNG"):
for child_urheber in child2.findall("URHEBER"):
if "Beratung" in child_urheber.text:
print("Beratung: ", child_urheber.text)
for child_fundstelle in child2.findall("FUNDSTELLE"):
vorgang = child_zuordnung.tag:child_zuordnung.text,
child_urheber.tag:child_urheber.text,
child_fundstelle.tag:child_fundstelle.text
for child_fundstelle_info in child2.findall("FUNDSTELLE_LINK"):
get_xml = pdf_xml_pattern.findall(str(child_fundstelle_info.text))
get_xml = "".join(get_xml)+".txt"
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, '*.txt')): # txt path
if get_xml in filename:
with open(filename, "r") as txtfile:
check_filename = filename[-9:-4] # for wp8 - wp12
print("Sitzung:", check_filename+".txt")
txt = txtfile.read()
text = re.sub(r"^[sS]*?(?:eröffnet)", "", txt)
text = re.sub(r"((cid|(cid):d+)|(Deutschers+Bundestags*D*s*d*W*w*s*D*sd*.s*w*W*w*W*w*W*w*W*d+W*w*W*d(.s+[A-Z_äÃÂöÃÂüÃÂÃÂ][A-Za-z_äÃÂöÃÂüÃÂÃÂ]*s*d*|d+)))", " ", text)
text = re.sub(r"((A)|(B)|(C)|(D))", "", text)
xml_page_pattern = r"(d+[A-D])"
page = re.findall(xml_page_pattern, child_fundstelle.text)
page_begin = 0
page_end = 0
if len(page) < 2:
page_begin = int(page[0][:-1])-1; page_end = int(page[0][:-1])+1
elif len(page) == 2:
page_begin = int(page[0][:-1])-1; page_end = int(page[1][:-1])+1
get_pages_pattern = re.compile(r"(?<=)(.*)(?=)".format(page_begin, page_end), flags=re.DOTALL)
text = get_pages_pattern.findall(text)
text = "".join(text)
for child_beschluss in child2.findall("BESCHLUSS"):
for beschluss_child in child_beschluss:
vorgang[beschluss_child.tag] = beschluss_child.text
VORNAME = ""
NACHNAME = ""
FRAKTION = ""
for child_speaker in child2.findall('PERSOENLICHER_URHEBER'):
speaker =
TITEL = ""
WAHLKREISZUSATZ = ""
FUNKTION = ""
############################################################################################################
for child in child_speaker:
speaker[child.tag] = child.text
if child.tag == "PERSON_TITEL":
TITEL = child.text
elif child.tag == "VORNAME":
VORNAME = child.text
elif child.tag == "NACHNAME":
NACHNAME = child.text
elif child.tag == "WAHLKREISZUSATZ":
WAHLKREISZUSATZ = child.text
elif child.tag == "FUNKTION":
FUNKTION = child.text
elif child.tag == "FRAKTION":
FRAKTION = child.text
else:
pass
############################################################################################################
splitter_mdb = ""
splitter_only_name_mdb = ""
splitter_not_mdb = ""
splitter_only_name_not_mdb = ""
############################################################################################################
if FRAKTION == "BÃÂNDNIS 90/DIE GRÃÂNEN": FRAKTION = "BÃÂNDNISs*90/DIEs*GRÃÂNEN"
elif FRAKTION == "CDU/CSU": FRAKTION = "CDU/CSU"
elif FRAKTION == "FDP": FRAKTION = "F.D.P."
############################################################################################################
if str(check_filename).startswith(("08", "09", "10", "11")): #WP
#print("here")
# only MdBs
if (not TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_only_name_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_only_name_mdb = r"n+(?=" + re.escape(TITEL) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (WAHLKREISZUSATZ) and (not TITEL) and (FUNKTION == MdB):
splitter_only_name_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"
elif (TITEL) and (WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_only_name_mdb = r"n+(?=" + re.escape(TITEL) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"
# only Minister etc.
elif (not FUNKTION == MdB) and (TITEL):
splitter_only_name_not_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (not FUNKTION == MdB) and (not TITEL):
splitter_only_name_not_mdb = r"n+(?=" + re.escape(TITEL) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"
elif (not FUNKTION == MdB) and (not TITEL):
splitter_only_name_not_mdb = splitter_not_mdb = r"n+(?=" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"
else:
############################################################################################################
### Splitter Regex ###
# only MdBs
if (not TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_mdb = r"s+(?=" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_mdb = r"s+(?=" + re.escape(TITEL) + "s*"+ re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (WAHLKREISZUSATZ) and (not TITEL) and (FUNKTION == MdB):
splitter_mdb = r"s+(?=" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"
elif (TITEL) and (WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_mdb = r"s+(?=" + re.escape(TITEL) + "s*" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"
############################################################################################################
# Minister etc. keine MdBs.
elif (not FUNKTION == MdB) and (TITEL):
splitter_not_mdb = r"s+(?=" + re.escape(TITEL) + "s+" + re.escape(VORNAME) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"
elif (not FUNKTION == MdB) and (not TITEL):
splitter_not_mdb = r"s+(?=" + re.escape(VORNAME) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"
### Splitter Regex Ende ###
############################################################################################################
if FUNKTION == MdB:
mdb_splitterlist.append(splitter_mdb)
if str(check_filename).startswith(("08", "09", "10", "11")):
mdb_splitterlist.append(splitter_only_name_mdb)
else:
pass
elif FUNKTION != MdB:
mdb_not_splitterlist.append(splitter_not_mdb)
if str(check_filename).startswith(("08", "09", "10", "11")):
mdb_not_splitterlist.append(splitter_only_name_not_mdb)
else:
pass
############################################################################################################
splitterlist = mdb_not_splitterlist + mdb_splitterlist
splitterlist.append(split_pattern_president)
splitterlist.append(split_rest_pattern)
str_list = list(filter(None, splitterlist))
splitted_text =
try:
splitted_text = re.split(pattern='|'.join(str_list), string=text)
print("Split erfolgreich.")
except Exception as e:
print(e)
with open("logs-me.txt", "a") as logme:
logme.write(filename+" "+file_id+" "+file+"n")
logme.close()
print("überspringe:", file_id)
continue
speeches =
############################################################################################################
for speechblock in splitted_text:
for speecher in splitterlist:
speecher = str(speecher[3:]).replace("=", ":") # regex
founded_speecher = re.findall(speecher, speechblock)
if (FUNKTION != MdB):
if (speaker['VORNAME'] in "".join(founded_speecher) and speaker['NACHNAME'] in "".join(founded_speecher)):
speeches.append(speechblock)
zurufe = re.findall(zurufe_pattern, speechblock, flags=re.DOTALL | re.MULTILINE)
speech = "TEXT":speeches, "ZURUFE":zurufe
speaker['INHALT'] = speech
break #todo -> workaround
elif (FUNKTION == MdB):
if (speaker['VORNAME'] in "".join(founded_speecher) and speaker['NACHNAME'] in "".join(founded_speecher)
or speaker['NACHNAME'] in "".join(founded_speecher) and speaker['FRAKTION'] in "".join(founded_speecher)):
speeches.append(speechblock)
zurufe = re.findall(zurufe_pattern, speechblock, flags=re.DOTALL | re.MULTILINE)
speech = "TEXT":speeches, "ZURUFE":zurufe
speaker['INHALT'] = speech
break #todo -> workaround
speakeritems.append(speaker)
if "REDNER" in vorgang:
vorgang["REDNER"].append(speaker)
else:
vorgang["REDNER"] = [speaker]
vorgangliste.append(vorgang)
gesetz["VORGANG"] = vorgangliste
if "INITIATIVE" in gesetz.keys():
if gesetz['INITIATIVE'] in br_list:
pass
elif gesetz['INITIATIVE'] not in br_list:
gesetzliste.append(gesetz)
else:
print("hier stimmt etwas nicht.")
'''Write JSON FILE'''
mpu.io.write("xssy33s.json", gesetzliste)
print(len(gesetzliste))
two()
python performance python-3.x json xml
 |Â
show 5 more comments
up vote
3
down vote
favorite
I have a long script here that does the following for me: I have 9000 XML files, from which I parse information, and then in about 7000 TXT files find the information out, and in the end it all save as JSON. Well, aside from what he does, he does it very slowly.
Since I am very new to Python, I did not learn Python from books, but just try it out. The code here is normal fast up to 100 iterations, but from about 120 iterations it gets very slow.
What can I do against it? I've been looking for solutions like generators, Cprofile, Cython, pypy, and similar, but all too advanced for me. How can I optimize this code to make it run faster? gc.collect()
did not help. I think it's more about the algorithm and the flow scheme, rather than what it does.
Sample Files: Files
cProfile Output: cProfile Output
import xml.etree.ElementTree as ET
import glob, os, re
import mpu.io
PATH_FINAL_XML = "/Users/x/PycharmProjects/BA/booking-system/final step/xml files"
PATH_FINAL_TXT = "/Users/x/PycharmProjects/BA/booking-system/final step/txt files"
os.chdir(PATH_FINAL_XML)
#path, dirs, files = os.walk(PATH).__next__()
#print("Files", len(files))
gesetzliste = list()
gesetzanzahlliste = list()
zeitliste = list()
br_list = ['Baden-Württemberg', "Bayern", "Berlin", "Bremen", "Hamburg",
"Hamburg", "Mecklenburg-Vorpommern",
"Nordrhein-Westfalen", "Niedersachsen", "Rheinland-Pfalz", "Saarland",
"Sachsen", "Sachsen-Anhalt", "Schleswig-Holstein", "Thüringen", "Bundesregierung"]
MdB = "MdB"
pdf_xml_pattern = re.compile("(?<=/)d+(?=.pdf)") # PDF Miner Pattern
split_pattern_president = r"n+(?=(?:Präsident|Präsidentin|Vizepräsident|Vizepräsidentin)s*(?:(?:Dr.)*s*[A-ZÃÂÃÂÃÂÃÂ]+[a-zöäüÃÂ]*s*)*:)" # Präsident und Vizepräsident im BT
split_rest_pattern = r"n+(?=(?:Dr.)*s*[A-ZÃÂÃÂÃÂ]*[a-züäö]*s*[A-ZÃÂÃÂÃÂ]*[a-züäö]*s*([A-ZÃÂÃÂÃÂX-,./s]*))" # alle auÃÂer, MdB, Minister, Gastredner etc.
zurufe_pattern = r"(((?:(?!bDokumentb|bDrucksacheb|bTagesordnungb|b(BÃÂNDNIS 90/DIE GRÃÂNEN)b)[A-Za-z_äÃÂöÃÂüÃÂÃÂ[]/,._âÂÂ!?:;'-âÂÂ0-9]*s)2,[A-Za-z_äÃÂöÃÂüÃÂÃÂ;!.?[]/,':-âÂÂâÂÂ0-9]+[.,?!]*?))" # um zurufe zu identifizieren
def two():
for file in glob.glob("*.xml"): # xml path
gesetz = dict()
'''Childnode Lists'''
schlagworterliste = list()
sachgebietliste = list()
drsliste = list()
plenumliste = list()
vorgangliste = list()
speakeritems = list()
mdb_splitterlist = list()
mdb_not_splitterlist = list()
splitterlist = list()
'''Childnode Lists'''
file_id = file.replace(".xml", "")
print("Gesetzentwurf: ", file)
tree = ET.parse(file)
root = tree.getroot()
for child in root:
gesetz["File_ID"] = file_id
if str(child.tag) == "SCHLAGWORT":
schlagworterliste.append(child.text)
gesetz[child.tag] = schlagworterliste
elif str(child.tag) == "SACHGEBIET":
sachgebietliste.append(child.text)
gesetz[child.tag] = sachgebietliste
elif str(child.tag) == "WICHTIGE_DRUCKSACHE":
for child_wichtige_drucksache_herausgeber in child.findall("DRS_HERAUSGEBER"):
for child_wichtige_drucksache_nummer in child.findall("DRS_NUMMER"):
for child_wichtige_drucksache_typ in child.findall("DRS_TYP"):
drs = child_wichtige_drucksache_herausgeber.tag:child_wichtige_drucksache_herausgeber.text,
child_wichtige_drucksache_nummer.tag:child_wichtige_drucksache_nummer.text,
child_wichtige_drucksache_typ.tag:child_wichtige_drucksache_typ.text
drsliste.append(drs)
gesetz[child.tag] = drsliste
elif str(child.tag) == "PLENUM":
for child_plenum_klartext in child.findall("PLPR_KLARTEXT"):
for child_plenum_herausgeber in child.findall("PLPR_HERAUSGEBER"):
for child_plenum_nummer in child.findall("PLPR_NUMMER"):
for child_plenum_seiten in child.findall("PLPR_SEITEN"):
for child_plenum_link in child.findall("PLPR_LINK"):
plenum = child_plenum_klartext.tag:child_plenum_klartext.text,
child_plenum_herausgeber.tag:child_plenum_herausgeber.text,
child_plenum_nummer.tag:child_plenum_nummer.text,
child_plenum_seiten.tag:child_plenum_seiten.text,
child_plenum_link.tag:child_plenum_link.text
plenumliste.append(plenum)
gesetz["PLPR"] = plenumliste
else:
gesetz[child.tag] = child.text
if "INITIATIVE" in gesetz.keys():
if gesetz['INITIATIVE'] in br_list:
print("CONTINUE")
continue
else:
print("DEVAM")
for child2 in child:
for child_zuordnung in child2.findall("ZUORDNUNG"):
for child_urheber in child2.findall("URHEBER"):
if "Beratung" in child_urheber.text:
print("Beratung: ", child_urheber.text)
for child_fundstelle in child2.findall("FUNDSTELLE"):
vorgang = child_zuordnung.tag:child_zuordnung.text,
child_urheber.tag:child_urheber.text,
child_fundstelle.tag:child_fundstelle.text
for child_fundstelle_info in child2.findall("FUNDSTELLE_LINK"):
get_xml = pdf_xml_pattern.findall(str(child_fundstelle_info.text))
get_xml = "".join(get_xml)+".txt"
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, '*.txt')): # txt path
if get_xml in filename:
with open(filename, "r") as txtfile:
check_filename = filename[-9:-4] # for wp8 - wp12
print("Sitzung:", check_filename+".txt")
txt = txtfile.read()
text = re.sub(r"^[sS]*?(?:eröffnet)", "", txt)
text = re.sub(r"((cid|(cid):d+)|(Deutschers+Bundestags*D*s*d*W*w*s*D*sd*.s*w*W*w*W*w*W*w*W*d+W*w*W*d(.s+[A-Z_äÃÂöÃÂüÃÂÃÂ][A-Za-z_äÃÂöÃÂüÃÂÃÂ]*s*d*|d+)))", " ", text)
text = re.sub(r"((A)|(B)|(C)|(D))", "", text)
xml_page_pattern = r"(d+[A-D])"
page = re.findall(xml_page_pattern, child_fundstelle.text)
page_begin = 0
page_end = 0
if len(page) < 2:
page_begin = int(page[0][:-1])-1; page_end = int(page[0][:-1])+1
elif len(page) == 2:
page_begin = int(page[0][:-1])-1; page_end = int(page[1][:-1])+1
get_pages_pattern = re.compile(r"(?<=)(.*)(?=)".format(page_begin, page_end), flags=re.DOTALL)
text = get_pages_pattern.findall(text)
text = "".join(text)
for child_beschluss in child2.findall("BESCHLUSS"):
for beschluss_child in child_beschluss:
vorgang[beschluss_child.tag] = beschluss_child.text
VORNAME = ""
NACHNAME = ""
FRAKTION = ""
for child_speaker in child2.findall('PERSOENLICHER_URHEBER'):
speaker =
TITEL = ""
WAHLKREISZUSATZ = ""
FUNKTION = ""
############################################################################################################
for child in child_speaker:
speaker[child.tag] = child.text
if child.tag == "PERSON_TITEL":
TITEL = child.text
elif child.tag == "VORNAME":
VORNAME = child.text
elif child.tag == "NACHNAME":
NACHNAME = child.text
elif child.tag == "WAHLKREISZUSATZ":
WAHLKREISZUSATZ = child.text
elif child.tag == "FUNKTION":
FUNKTION = child.text
elif child.tag == "FRAKTION":
FRAKTION = child.text
else:
pass
############################################################################################################
splitter_mdb = ""
splitter_only_name_mdb = ""
splitter_not_mdb = ""
splitter_only_name_not_mdb = ""
############################################################################################################
if FRAKTION == "BÃÂNDNIS 90/DIE GRÃÂNEN": FRAKTION = "BÃÂNDNISs*90/DIEs*GRÃÂNEN"
elif FRAKTION == "CDU/CSU": FRAKTION = "CDU/CSU"
elif FRAKTION == "FDP": FRAKTION = "F.D.P."
############################################################################################################
if str(check_filename).startswith(("08", "09", "10", "11")): #WP
#print("here")
# only MdBs
if (not TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_only_name_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_only_name_mdb = r"n+(?=" + re.escape(TITEL) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (WAHLKREISZUSATZ) and (not TITEL) and (FUNKTION == MdB):
splitter_only_name_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"
elif (TITEL) and (WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_only_name_mdb = r"n+(?=" + re.escape(TITEL) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"
# only Minister etc.
elif (not FUNKTION == MdB) and (TITEL):
splitter_only_name_not_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (not FUNKTION == MdB) and (not TITEL):
splitter_only_name_not_mdb = r"n+(?=" + re.escape(TITEL) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"
elif (not FUNKTION == MdB) and (not TITEL):
splitter_only_name_not_mdb = splitter_not_mdb = r"n+(?=" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"
else:
############################################################################################################
### Splitter Regex ###
# only MdBs
if (not TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_mdb = r"s+(?=" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_mdb = r"s+(?=" + re.escape(TITEL) + "s*"+ re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (WAHLKREISZUSATZ) and (not TITEL) and (FUNKTION == MdB):
splitter_mdb = r"s+(?=" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"
elif (TITEL) and (WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_mdb = r"s+(?=" + re.escape(TITEL) + "s*" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"
############################################################################################################
# Minister etc. keine MdBs.
elif (not FUNKTION == MdB) and (TITEL):
splitter_not_mdb = r"s+(?=" + re.escape(TITEL) + "s+" + re.escape(VORNAME) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"
elif (not FUNKTION == MdB) and (not TITEL):
splitter_not_mdb = r"s+(?=" + re.escape(VORNAME) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"
### Splitter Regex Ende ###
############################################################################################################
if FUNKTION == MdB:
mdb_splitterlist.append(splitter_mdb)
if str(check_filename).startswith(("08", "09", "10", "11")):
mdb_splitterlist.append(splitter_only_name_mdb)
else:
pass
elif FUNKTION != MdB:
mdb_not_splitterlist.append(splitter_not_mdb)
if str(check_filename).startswith(("08", "09", "10", "11")):
mdb_not_splitterlist.append(splitter_only_name_not_mdb)
else:
pass
############################################################################################################
splitterlist = mdb_not_splitterlist + mdb_splitterlist
splitterlist.append(split_pattern_president)
splitterlist.append(split_rest_pattern)
str_list = list(filter(None, splitterlist))
splitted_text =
try:
splitted_text = re.split(pattern='|'.join(str_list), string=text)
print("Split erfolgreich.")
except Exception as e:
print(e)
with open("logs-me.txt", "a") as logme:
logme.write(filename+" "+file_id+" "+file+"n")
logme.close()
print("überspringe:", file_id)
continue
speeches =
############################################################################################################
for speechblock in splitted_text:
for speecher in splitterlist:
speecher = str(speecher[3:]).replace("=", ":") # regex
founded_speecher = re.findall(speecher, speechblock)
if (FUNKTION != MdB):
if (speaker['VORNAME'] in "".join(founded_speecher) and speaker['NACHNAME'] in "".join(founded_speecher)):
speeches.append(speechblock)
zurufe = re.findall(zurufe_pattern, speechblock, flags=re.DOTALL | re.MULTILINE)
speech = "TEXT":speeches, "ZURUFE":zurufe
speaker['INHALT'] = speech
break #todo -> workaround
elif (FUNKTION == MdB):
if (speaker['VORNAME'] in "".join(founded_speecher) and speaker['NACHNAME'] in "".join(founded_speecher)
or speaker['NACHNAME'] in "".join(founded_speecher) and speaker['FRAKTION'] in "".join(founded_speecher)):
speeches.append(speechblock)
zurufe = re.findall(zurufe_pattern, speechblock, flags=re.DOTALL | re.MULTILINE)
speech = "TEXT":speeches, "ZURUFE":zurufe
speaker['INHALT'] = speech
break #todo -> workaround
speakeritems.append(speaker)
if "REDNER" in vorgang:
vorgang["REDNER"].append(speaker)
else:
vorgang["REDNER"] = [speaker]
vorgangliste.append(vorgang)
gesetz["VORGANG"] = vorgangliste
if "INITIATIVE" in gesetz.keys():
if gesetz['INITIATIVE'] in br_list:
pass
elif gesetz['INITIATIVE'] not in br_list:
gesetzliste.append(gesetz)
else:
print("hier stimmt etwas nicht.")
'''Write JSON FILE'''
mpu.io.write("xssy33s.json", gesetzliste)
print(len(gesetzliste))
two()
python performance python-3.x json xml
Can you add the output of the following command?python -m cProfile -s tottime <filename>.py 100 1
â Mast
Jul 11 at 12:36
cProfile
looks more complicated than it is. Read the manual and a couple of example uses, mess around a bit on smaller pieces of code and you'll figure it out in no-time at all.
â Mast
Jul 11 at 12:38
1
You should run the profiler with more than one file (but less than 1000). Obviously there is something which takes longer for each additional file (probably anin
on some growing list or something like that), which is not visible when only running with one file.
â Graipher
Jul 11 at 12:54
1
understood. @Graipher I will try it now. Thank you for your help.
â madik_atma
Jul 11 at 12:58
2
When you've profiled your code, please add the output to the question itself, not in the comments.
â Mast
Jul 11 at 13:01
 |Â
show 5 more comments
up vote
3
down vote
favorite
up vote
3
down vote
favorite
I have a long script here that does the following for me: I have 9000 XML files, from which I parse information, and then in about 7000 TXT files find the information out, and in the end it all save as JSON. Well, aside from what he does, he does it very slowly.
Since I am very new to Python, I did not learn Python from books, but just try it out. The code here is normal fast up to 100 iterations, but from about 120 iterations it gets very slow.
What can I do against it? I've been looking for solutions like generators, Cprofile, Cython, pypy, and similar, but all too advanced for me. How can I optimize this code to make it run faster? gc.collect()
did not help. I think it's more about the algorithm and the flow scheme, rather than what it does.
Sample Files: Files
cProfile Output: cProfile Output
import xml.etree.ElementTree as ET
import glob, os, re
import mpu.io
PATH_FINAL_XML = "/Users/x/PycharmProjects/BA/booking-system/final step/xml files"
PATH_FINAL_TXT = "/Users/x/PycharmProjects/BA/booking-system/final step/txt files"
os.chdir(PATH_FINAL_XML)
#path, dirs, files = os.walk(PATH).__next__()
#print("Files", len(files))
gesetzliste = list()
gesetzanzahlliste = list()
zeitliste = list()
br_list = ['Baden-Württemberg', "Bayern", "Berlin", "Bremen", "Hamburg",
"Hamburg", "Mecklenburg-Vorpommern",
"Nordrhein-Westfalen", "Niedersachsen", "Rheinland-Pfalz", "Saarland",
"Sachsen", "Sachsen-Anhalt", "Schleswig-Holstein", "Thüringen", "Bundesregierung"]
MdB = "MdB"
pdf_xml_pattern = re.compile("(?<=/)d+(?=.pdf)") # PDF Miner Pattern
split_pattern_president = r"n+(?=(?:Präsident|Präsidentin|Vizepräsident|Vizepräsidentin)s*(?:(?:Dr.)*s*[A-ZÃÂÃÂÃÂÃÂ]+[a-zöäüÃÂ]*s*)*:)" # Präsident und Vizepräsident im BT
split_rest_pattern = r"n+(?=(?:Dr.)*s*[A-ZÃÂÃÂÃÂ]*[a-züäö]*s*[A-ZÃÂÃÂÃÂ]*[a-züäö]*s*([A-ZÃÂÃÂÃÂX-,./s]*))" # alle auÃÂer, MdB, Minister, Gastredner etc.
zurufe_pattern = r"(((?:(?!bDokumentb|bDrucksacheb|bTagesordnungb|b(BÃÂNDNIS 90/DIE GRÃÂNEN)b)[A-Za-z_äÃÂöÃÂüÃÂÃÂ[]/,._âÂÂ!?:;'-âÂÂ0-9]*s)2,[A-Za-z_äÃÂöÃÂüÃÂÃÂ;!.?[]/,':-âÂÂâÂÂ0-9]+[.,?!]*?))" # um zurufe zu identifizieren
def two():
for file in glob.glob("*.xml"): # xml path
gesetz = dict()
'''Childnode Lists'''
schlagworterliste = list()
sachgebietliste = list()
drsliste = list()
plenumliste = list()
vorgangliste = list()
speakeritems = list()
mdb_splitterlist = list()
mdb_not_splitterlist = list()
splitterlist = list()
'''Childnode Lists'''
file_id = file.replace(".xml", "")
print("Gesetzentwurf: ", file)
tree = ET.parse(file)
root = tree.getroot()
for child in root:
gesetz["File_ID"] = file_id
if str(child.tag) == "SCHLAGWORT":
schlagworterliste.append(child.text)
gesetz[child.tag] = schlagworterliste
elif str(child.tag) == "SACHGEBIET":
sachgebietliste.append(child.text)
gesetz[child.tag] = sachgebietliste
elif str(child.tag) == "WICHTIGE_DRUCKSACHE":
for child_wichtige_drucksache_herausgeber in child.findall("DRS_HERAUSGEBER"):
for child_wichtige_drucksache_nummer in child.findall("DRS_NUMMER"):
for child_wichtige_drucksache_typ in child.findall("DRS_TYP"):
drs = child_wichtige_drucksache_herausgeber.tag:child_wichtige_drucksache_herausgeber.text,
child_wichtige_drucksache_nummer.tag:child_wichtige_drucksache_nummer.text,
child_wichtige_drucksache_typ.tag:child_wichtige_drucksache_typ.text
drsliste.append(drs)
gesetz[child.tag] = drsliste
elif str(child.tag) == "PLENUM":
for child_plenum_klartext in child.findall("PLPR_KLARTEXT"):
for child_plenum_herausgeber in child.findall("PLPR_HERAUSGEBER"):
for child_plenum_nummer in child.findall("PLPR_NUMMER"):
for child_plenum_seiten in child.findall("PLPR_SEITEN"):
for child_plenum_link in child.findall("PLPR_LINK"):
plenum = child_plenum_klartext.tag:child_plenum_klartext.text,
child_plenum_herausgeber.tag:child_plenum_herausgeber.text,
child_plenum_nummer.tag:child_plenum_nummer.text,
child_plenum_seiten.tag:child_plenum_seiten.text,
child_plenum_link.tag:child_plenum_link.text
plenumliste.append(plenum)
gesetz["PLPR"] = plenumliste
else:
gesetz[child.tag] = child.text
if "INITIATIVE" in gesetz.keys():
if gesetz['INITIATIVE'] in br_list:
print("CONTINUE")
continue
else:
print("DEVAM")
for child2 in child:
for child_zuordnung in child2.findall("ZUORDNUNG"):
for child_urheber in child2.findall("URHEBER"):
if "Beratung" in child_urheber.text:
print("Beratung: ", child_urheber.text)
for child_fundstelle in child2.findall("FUNDSTELLE"):
vorgang = child_zuordnung.tag:child_zuordnung.text,
child_urheber.tag:child_urheber.text,
child_fundstelle.tag:child_fundstelle.text
for child_fundstelle_info in child2.findall("FUNDSTELLE_LINK"):
get_xml = pdf_xml_pattern.findall(str(child_fundstelle_info.text))
get_xml = "".join(get_xml)+".txt"
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, '*.txt')): # txt path
if get_xml in filename:
with open(filename, "r") as txtfile:
check_filename = filename[-9:-4] # for wp8 - wp12
print("Sitzung:", check_filename+".txt")
txt = txtfile.read()
text = re.sub(r"^[sS]*?(?:eröffnet)", "", txt)
text = re.sub(r"((cid|(cid):d+)|(Deutschers+Bundestags*D*s*d*W*w*s*D*sd*.s*w*W*w*W*w*W*w*W*d+W*w*W*d(.s+[A-Z_äÃÂöÃÂüÃÂÃÂ][A-Za-z_äÃÂöÃÂüÃÂÃÂ]*s*d*|d+)))", " ", text)
text = re.sub(r"((A)|(B)|(C)|(D))", "", text)
xml_page_pattern = r"(d+[A-D])"
page = re.findall(xml_page_pattern, child_fundstelle.text)
page_begin = 0
page_end = 0
if len(page) < 2:
page_begin = int(page[0][:-1])-1; page_end = int(page[0][:-1])+1
elif len(page) == 2:
page_begin = int(page[0][:-1])-1; page_end = int(page[1][:-1])+1
get_pages_pattern = re.compile(r"(?<=)(.*)(?=)".format(page_begin, page_end), flags=re.DOTALL)
text = get_pages_pattern.findall(text)
text = "".join(text)
for child_beschluss in child2.findall("BESCHLUSS"):
for beschluss_child in child_beschluss:
vorgang[beschluss_child.tag] = beschluss_child.text
VORNAME = ""
NACHNAME = ""
FRAKTION = ""
for child_speaker in child2.findall('PERSOENLICHER_URHEBER'):
speaker =
TITEL = ""
WAHLKREISZUSATZ = ""
FUNKTION = ""
############################################################################################################
for child in child_speaker:
speaker[child.tag] = child.text
if child.tag == "PERSON_TITEL":
TITEL = child.text
elif child.tag == "VORNAME":
VORNAME = child.text
elif child.tag == "NACHNAME":
NACHNAME = child.text
elif child.tag == "WAHLKREISZUSATZ":
WAHLKREISZUSATZ = child.text
elif child.tag == "FUNKTION":
FUNKTION = child.text
elif child.tag == "FRAKTION":
FRAKTION = child.text
else:
pass
############################################################################################################
splitter_mdb = ""
splitter_only_name_mdb = ""
splitter_not_mdb = ""
splitter_only_name_not_mdb = ""
############################################################################################################
if FRAKTION == "BÃÂNDNIS 90/DIE GRÃÂNEN": FRAKTION = "BÃÂNDNISs*90/DIEs*GRÃÂNEN"
elif FRAKTION == "CDU/CSU": FRAKTION = "CDU/CSU"
elif FRAKTION == "FDP": FRAKTION = "F.D.P."
############################################################################################################
if str(check_filename).startswith(("08", "09", "10", "11")): #WP
#print("here")
# only MdBs
if (not TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_only_name_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_only_name_mdb = r"n+(?=" + re.escape(TITEL) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (WAHLKREISZUSATZ) and (not TITEL) and (FUNKTION == MdB):
splitter_only_name_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"
elif (TITEL) and (WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_only_name_mdb = r"n+(?=" + re.escape(TITEL) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"
# only Minister etc.
elif (not FUNKTION == MdB) and (TITEL):
splitter_only_name_not_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (not FUNKTION == MdB) and (not TITEL):
splitter_only_name_not_mdb = r"n+(?=" + re.escape(TITEL) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"
elif (not FUNKTION == MdB) and (not TITEL):
splitter_only_name_not_mdb = splitter_not_mdb = r"n+(?=" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"
else:
############################################################################################################
### Splitter Regex ###
# only MdBs
if (not TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_mdb = r"s+(?=" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_mdb = r"s+(?=" + re.escape(TITEL) + "s*"+ re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (WAHLKREISZUSATZ) and (not TITEL) and (FUNKTION == MdB):
splitter_mdb = r"s+(?=" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"
elif (TITEL) and (WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_mdb = r"s+(?=" + re.escape(TITEL) + "s*" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"
############################################################################################################
# Minister etc. keine MdBs.
elif (not FUNKTION == MdB) and (TITEL):
splitter_not_mdb = r"s+(?=" + re.escape(TITEL) + "s+" + re.escape(VORNAME) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"
elif (not FUNKTION == MdB) and (not TITEL):
splitter_not_mdb = r"s+(?=" + re.escape(VORNAME) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"
### Splitter Regex Ende ###
############################################################################################################
if FUNKTION == MdB:
mdb_splitterlist.append(splitter_mdb)
if str(check_filename).startswith(("08", "09", "10", "11")):
mdb_splitterlist.append(splitter_only_name_mdb)
else:
pass
elif FUNKTION != MdB:
mdb_not_splitterlist.append(splitter_not_mdb)
if str(check_filename).startswith(("08", "09", "10", "11")):
mdb_not_splitterlist.append(splitter_only_name_not_mdb)
else:
pass
############################################################################################################
splitterlist = mdb_not_splitterlist + mdb_splitterlist
splitterlist.append(split_pattern_president)
splitterlist.append(split_rest_pattern)
str_list = list(filter(None, splitterlist))
splitted_text =
try:
splitted_text = re.split(pattern='|'.join(str_list), string=text)
print("Split erfolgreich.")
except Exception as e:
print(e)
with open("logs-me.txt", "a") as logme:
logme.write(filename+" "+file_id+" "+file+"n")
logme.close()
print("überspringe:", file_id)
continue
speeches =
############################################################################################################
for speechblock in splitted_text:
for speecher in splitterlist:
speecher = str(speecher[3:]).replace("=", ":") # regex
founded_speecher = re.findall(speecher, speechblock)
if (FUNKTION != MdB):
if (speaker['VORNAME'] in "".join(founded_speecher) and speaker['NACHNAME'] in "".join(founded_speecher)):
speeches.append(speechblock)
zurufe = re.findall(zurufe_pattern, speechblock, flags=re.DOTALL | re.MULTILINE)
speech = "TEXT":speeches, "ZURUFE":zurufe
speaker['INHALT'] = speech
break #todo -> workaround
elif (FUNKTION == MdB):
if (speaker['VORNAME'] in "".join(founded_speecher) and speaker['NACHNAME'] in "".join(founded_speecher)
or speaker['NACHNAME'] in "".join(founded_speecher) and speaker['FRAKTION'] in "".join(founded_speecher)):
speeches.append(speechblock)
zurufe = re.findall(zurufe_pattern, speechblock, flags=re.DOTALL | re.MULTILINE)
speech = "TEXT":speeches, "ZURUFE":zurufe
speaker['INHALT'] = speech
break #todo -> workaround
speakeritems.append(speaker)
if "REDNER" in vorgang:
vorgang["REDNER"].append(speaker)
else:
vorgang["REDNER"] = [speaker]
vorgangliste.append(vorgang)
gesetz["VORGANG"] = vorgangliste
if "INITIATIVE" in gesetz.keys():
if gesetz['INITIATIVE'] in br_list:
pass
elif gesetz['INITIATIVE'] not in br_list:
gesetzliste.append(gesetz)
else:
print("hier stimmt etwas nicht.")
'''Write JSON FILE'''
mpu.io.write("xssy33s.json", gesetzliste)
print(len(gesetzliste))
two()
python performance python-3.x json xml
I have a long script here that does the following for me: I have 9000 XML files, from which I parse information, and then in about 7000 TXT files find the information out, and in the end it all save as JSON. Well, aside from what he does, he does it very slowly.
Since I am very new to Python, I did not learn Python from books, but just try it out. The code here is normal fast up to 100 iterations, but from about 120 iterations it gets very slow.
What can I do against it? I've been looking for solutions like generators, Cprofile, Cython, pypy, and similar, but all too advanced for me. How can I optimize this code to make it run faster? gc.collect()
did not help. I think it's more about the algorithm and the flow scheme, rather than what it does.
Sample Files: Files
cProfile Output: cProfile Output
import xml.etree.ElementTree as ET
import glob, os, re
import mpu.io
PATH_FINAL_XML = "/Users/x/PycharmProjects/BA/booking-system/final step/xml files"
PATH_FINAL_TXT = "/Users/x/PycharmProjects/BA/booking-system/final step/txt files"
os.chdir(PATH_FINAL_XML)
#path, dirs, files = os.walk(PATH).__next__()
#print("Files", len(files))
gesetzliste = list()
gesetzanzahlliste = list()
zeitliste = list()
br_list = ['Baden-Württemberg', "Bayern", "Berlin", "Bremen", "Hamburg",
"Hamburg", "Mecklenburg-Vorpommern",
"Nordrhein-Westfalen", "Niedersachsen", "Rheinland-Pfalz", "Saarland",
"Sachsen", "Sachsen-Anhalt", "Schleswig-Holstein", "Thüringen", "Bundesregierung"]
MdB = "MdB"
pdf_xml_pattern = re.compile("(?<=/)d+(?=.pdf)") # PDF Miner Pattern
split_pattern_president = r"n+(?=(?:Präsident|Präsidentin|Vizepräsident|Vizepräsidentin)s*(?:(?:Dr.)*s*[A-ZÃÂÃÂÃÂÃÂ]+[a-zöäüÃÂ]*s*)*:)" # Präsident und Vizepräsident im BT
split_rest_pattern = r"n+(?=(?:Dr.)*s*[A-ZÃÂÃÂÃÂ]*[a-züäö]*s*[A-ZÃÂÃÂÃÂ]*[a-züäö]*s*([A-ZÃÂÃÂÃÂX-,./s]*))" # alle auÃÂer, MdB, Minister, Gastredner etc.
zurufe_pattern = r"(((?:(?!bDokumentb|bDrucksacheb|bTagesordnungb|b(BÃÂNDNIS 90/DIE GRÃÂNEN)b)[A-Za-z_äÃÂöÃÂüÃÂÃÂ[]/,._âÂÂ!?:;'-âÂÂ0-9]*s)2,[A-Za-z_äÃÂöÃÂüÃÂÃÂ;!.?[]/,':-âÂÂâÂÂ0-9]+[.,?!]*?))" # um zurufe zu identifizieren
def two():
for file in glob.glob("*.xml"): # xml path
gesetz = dict()
'''Childnode Lists'''
schlagworterliste = list()
sachgebietliste = list()
drsliste = list()
plenumliste = list()
vorgangliste = list()
speakeritems = list()
mdb_splitterlist = list()
mdb_not_splitterlist = list()
splitterlist = list()
'''Childnode Lists'''
file_id = file.replace(".xml", "")
print("Gesetzentwurf: ", file)
tree = ET.parse(file)
root = tree.getroot()
for child in root:
gesetz["File_ID"] = file_id
if str(child.tag) == "SCHLAGWORT":
schlagworterliste.append(child.text)
gesetz[child.tag] = schlagworterliste
elif str(child.tag) == "SACHGEBIET":
sachgebietliste.append(child.text)
gesetz[child.tag] = sachgebietliste
elif str(child.tag) == "WICHTIGE_DRUCKSACHE":
for child_wichtige_drucksache_herausgeber in child.findall("DRS_HERAUSGEBER"):
for child_wichtige_drucksache_nummer in child.findall("DRS_NUMMER"):
for child_wichtige_drucksache_typ in child.findall("DRS_TYP"):
drs = child_wichtige_drucksache_herausgeber.tag:child_wichtige_drucksache_herausgeber.text,
child_wichtige_drucksache_nummer.tag:child_wichtige_drucksache_nummer.text,
child_wichtige_drucksache_typ.tag:child_wichtige_drucksache_typ.text
drsliste.append(drs)
gesetz[child.tag] = drsliste
elif str(child.tag) == "PLENUM":
for child_plenum_klartext in child.findall("PLPR_KLARTEXT"):
for child_plenum_herausgeber in child.findall("PLPR_HERAUSGEBER"):
for child_plenum_nummer in child.findall("PLPR_NUMMER"):
for child_plenum_seiten in child.findall("PLPR_SEITEN"):
for child_plenum_link in child.findall("PLPR_LINK"):
plenum = child_plenum_klartext.tag:child_plenum_klartext.text,
child_plenum_herausgeber.tag:child_plenum_herausgeber.text,
child_plenum_nummer.tag:child_plenum_nummer.text,
child_plenum_seiten.tag:child_plenum_seiten.text,
child_plenum_link.tag:child_plenum_link.text
plenumliste.append(plenum)
gesetz["PLPR"] = plenumliste
else:
gesetz[child.tag] = child.text
if "INITIATIVE" in gesetz.keys():
if gesetz['INITIATIVE'] in br_list:
print("CONTINUE")
continue
else:
print("DEVAM")
for child2 in child:
for child_zuordnung in child2.findall("ZUORDNUNG"):
for child_urheber in child2.findall("URHEBER"):
if "Beratung" in child_urheber.text:
print("Beratung: ", child_urheber.text)
for child_fundstelle in child2.findall("FUNDSTELLE"):
vorgang = child_zuordnung.tag:child_zuordnung.text,
child_urheber.tag:child_urheber.text,
child_fundstelle.tag:child_fundstelle.text
for child_fundstelle_info in child2.findall("FUNDSTELLE_LINK"):
get_xml = pdf_xml_pattern.findall(str(child_fundstelle_info.text))
get_xml = "".join(get_xml)+".txt"
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, '*.txt')): # txt path
if get_xml in filename:
with open(filename, "r") as txtfile:
check_filename = filename[-9:-4] # for wp8 - wp12
print("Sitzung:", check_filename+".txt")
txt = txtfile.read()
text = re.sub(r"^[sS]*?(?:eröffnet)", "", txt)
text = re.sub(r"((cid|(cid):d+)|(Deutschers+Bundestags*D*s*d*W*w*s*D*sd*.s*w*W*w*W*w*W*w*W*d+W*w*W*d(.s+[A-Z_äÃÂöÃÂüÃÂÃÂ][A-Za-z_äÃÂöÃÂüÃÂÃÂ]*s*d*|d+)))", " ", text)
text = re.sub(r"((A)|(B)|(C)|(D))", "", text)
xml_page_pattern = r"(d+[A-D])"
page = re.findall(xml_page_pattern, child_fundstelle.text)
page_begin = 0
page_end = 0
if len(page) < 2:
page_begin = int(page[0][:-1])-1; page_end = int(page[0][:-1])+1
elif len(page) == 2:
page_begin = int(page[0][:-1])-1; page_end = int(page[1][:-1])+1
get_pages_pattern = re.compile(r"(?<=)(.*)(?=)".format(page_begin, page_end), flags=re.DOTALL)
text = get_pages_pattern.findall(text)
text = "".join(text)
for child_beschluss in child2.findall("BESCHLUSS"):
for beschluss_child in child_beschluss:
vorgang[beschluss_child.tag] = beschluss_child.text
VORNAME = ""
NACHNAME = ""
FRAKTION = ""
for child_speaker in child2.findall('PERSOENLICHER_URHEBER'):
speaker =
TITEL = ""
WAHLKREISZUSATZ = ""
FUNKTION = ""
############################################################################################################
for child in child_speaker:
speaker[child.tag] = child.text
if child.tag == "PERSON_TITEL":
TITEL = child.text
elif child.tag == "VORNAME":
VORNAME = child.text
elif child.tag == "NACHNAME":
NACHNAME = child.text
elif child.tag == "WAHLKREISZUSATZ":
WAHLKREISZUSATZ = child.text
elif child.tag == "FUNKTION":
FUNKTION = child.text
elif child.tag == "FRAKTION":
FRAKTION = child.text
else:
pass
############################################################################################################
splitter_mdb = ""
splitter_only_name_mdb = ""
splitter_not_mdb = ""
splitter_only_name_not_mdb = ""
############################################################################################################
if FRAKTION == "BÃÂNDNIS 90/DIE GRÃÂNEN": FRAKTION = "BÃÂNDNISs*90/DIEs*GRÃÂNEN"
elif FRAKTION == "CDU/CSU": FRAKTION = "CDU/CSU"
elif FRAKTION == "FDP": FRAKTION = "F.D.P."
############################################################################################################
if str(check_filename).startswith(("08", "09", "10", "11")): #WP
#print("here")
# only MdBs
if (not TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_only_name_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_only_name_mdb = r"n+(?=" + re.escape(TITEL) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (WAHLKREISZUSATZ) and (not TITEL) and (FUNKTION == MdB):
splitter_only_name_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"
elif (TITEL) and (WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_only_name_mdb = r"n+(?=" + re.escape(TITEL) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"
# only Minister etc.
elif (not FUNKTION == MdB) and (TITEL):
splitter_only_name_not_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (not FUNKTION == MdB) and (not TITEL):
splitter_only_name_not_mdb = r"n+(?=" + re.escape(TITEL) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"
elif (not FUNKTION == MdB) and (not TITEL):
splitter_only_name_not_mdb = splitter_not_mdb = r"n+(?=" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"
else:
############################################################################################################
### Splitter Regex ###
# only MdBs
if (not TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_mdb = r"s+(?=" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_mdb = r"s+(?=" + re.escape(TITEL) + "s*"+ re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"
elif (WAHLKREISZUSATZ) and (not TITEL) and (FUNKTION == MdB):
splitter_mdb = r"s+(?=" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"
elif (TITEL) and (WAHLKREISZUSATZ) and (FUNKTION == MdB):
splitter_mdb = r"s+(?=" + re.escape(TITEL) + "s*" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"
############################################################################################################
# Minister etc. keine MdBs.
elif (not FUNKTION == MdB) and (TITEL):
splitter_not_mdb = r"s+(?=" + re.escape(TITEL) + "s+" + re.escape(VORNAME) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"
elif (not FUNKTION == MdB) and (not TITEL):
splitter_not_mdb = r"s+(?=" + re.escape(VORNAME) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"
### Splitter Regex Ende ###
############################################################################################################
if FUNKTION == MdB:
mdb_splitterlist.append(splitter_mdb)
if str(check_filename).startswith(("08", "09", "10", "11")):
mdb_splitterlist.append(splitter_only_name_mdb)
else:
pass
elif FUNKTION != MdB:
mdb_not_splitterlist.append(splitter_not_mdb)
if str(check_filename).startswith(("08", "09", "10", "11")):
mdb_not_splitterlist.append(splitter_only_name_not_mdb)
else:
pass
############################################################################################################
splitterlist = mdb_not_splitterlist + mdb_splitterlist
splitterlist.append(split_pattern_president)
splitterlist.append(split_rest_pattern)
str_list = list(filter(None, splitterlist))
splitted_text =
try:
splitted_text = re.split(pattern='|'.join(str_list), string=text)
print("Split erfolgreich.")
except Exception as e:
print(e)
with open("logs-me.txt", "a") as logme:
logme.write(filename+" "+file_id+" "+file+"n")
logme.close()
print("überspringe:", file_id)
continue
speeches =
############################################################################################################
for speechblock in splitted_text:
for speecher in splitterlist:
speecher = str(speecher[3:]).replace("=", ":") # regex
founded_speecher = re.findall(speecher, speechblock)
if (FUNKTION != MdB):
if (speaker['VORNAME'] in "".join(founded_speecher) and speaker['NACHNAME'] in "".join(founded_speecher)):
speeches.append(speechblock)
zurufe = re.findall(zurufe_pattern, speechblock, flags=re.DOTALL | re.MULTILINE)
speech = "TEXT":speeches, "ZURUFE":zurufe
speaker['INHALT'] = speech
break #todo -> workaround
elif (FUNKTION == MdB):
if (speaker['VORNAME'] in "".join(founded_speecher) and speaker['NACHNAME'] in "".join(founded_speecher)
or speaker['NACHNAME'] in "".join(founded_speecher) and speaker['FRAKTION'] in "".join(founded_speecher)):
speeches.append(speechblock)
zurufe = re.findall(zurufe_pattern, speechblock, flags=re.DOTALL | re.MULTILINE)
speech = "TEXT":speeches, "ZURUFE":zurufe
speaker['INHALT'] = speech
break #todo -> workaround
speakeritems.append(speaker)
if "REDNER" in vorgang:
vorgang["REDNER"].append(speaker)
else:
vorgang["REDNER"] = [speaker]
vorgangliste.append(vorgang)
gesetz["VORGANG"] = vorgangliste
if "INITIATIVE" in gesetz.keys():
if gesetz['INITIATIVE'] in br_list:
pass
elif gesetz['INITIATIVE'] not in br_list:
gesetzliste.append(gesetz)
else:
print("hier stimmt etwas nicht.")
'''Write JSON FILE'''
mpu.io.write("xssy33s.json", gesetzliste)
print(len(gesetzliste))
two()
python performance python-3.x json xml
edited Jul 12 at 15:05
asked Jul 11 at 11:53
madik_atma
164
164
Can you add the output of the following command?python -m cProfile -s tottime <filename>.py 100 1
â Mast
Jul 11 at 12:36
cProfile
looks more complicated than it is. Read the manual and a couple of example uses, mess around a bit on smaller pieces of code and you'll figure it out in no-time at all.
â Mast
Jul 11 at 12:38
1
You should run the profiler with more than one file (but less than 1000). Obviously there is something which takes longer for each additional file (probably anin
on some growing list or something like that), which is not visible when only running with one file.
â Graipher
Jul 11 at 12:54
1
understood. @Graipher I will try it now. Thank you for your help.
â madik_atma
Jul 11 at 12:58
2
When you've profiled your code, please add the output to the question itself, not in the comments.
â Mast
Jul 11 at 13:01
 |Â
show 5 more comments
Can you add the output of the following command?python -m cProfile -s tottime <filename>.py 100 1
â Mast
Jul 11 at 12:36
cProfile
looks more complicated than it is. Read the manual and a couple of example uses, mess around a bit on smaller pieces of code and you'll figure it out in no-time at all.
â Mast
Jul 11 at 12:38
1
You should run the profiler with more than one file (but less than 1000). Obviously there is something which takes longer for each additional file (probably anin
on some growing list or something like that), which is not visible when only running with one file.
â Graipher
Jul 11 at 12:54
1
understood. @Graipher I will try it now. Thank you for your help.
â madik_atma
Jul 11 at 12:58
2
When you've profiled your code, please add the output to the question itself, not in the comments.
â Mast
Jul 11 at 13:01
Can you add the output of the following command?
python -m cProfile -s tottime <filename>.py 100 1
â Mast
Jul 11 at 12:36
Can you add the output of the following command?
python -m cProfile -s tottime <filename>.py 100 1
â Mast
Jul 11 at 12:36
cProfile
looks more complicated than it is. Read the manual and a couple of example uses, mess around a bit on smaller pieces of code and you'll figure it out in no-time at all.â Mast
Jul 11 at 12:38
cProfile
looks more complicated than it is. Read the manual and a couple of example uses, mess around a bit on smaller pieces of code and you'll figure it out in no-time at all.â Mast
Jul 11 at 12:38
1
1
You should run the profiler with more than one file (but less than 1000). Obviously there is something which takes longer for each additional file (probably an
in
on some growing list or something like that), which is not visible when only running with one file.â Graipher
Jul 11 at 12:54
You should run the profiler with more than one file (but less than 1000). Obviously there is something which takes longer for each additional file (probably an
in
on some growing list or something like that), which is not visible when only running with one file.â Graipher
Jul 11 at 12:54
1
1
understood. @Graipher I will try it now. Thank you for your help.
â madik_atma
Jul 11 at 12:58
understood. @Graipher I will try it now. Thank you for your help.
â madik_atma
Jul 11 at 12:58
2
2
When you've profiled your code, please add the output to the question itself, not in the comments.
â Mast
Jul 11 at 13:01
When you've profiled your code, please add the output to the question itself, not in the comments.
â Mast
Jul 11 at 13:01
 |Â
show 5 more comments
1 Answer
1
active
oldest
votes
up vote
3
down vote
Runtime
This is the only obvious place I could find that looks like it might scale badly with an increasing number of files in the directory:
get_xml = pdf_xml_pattern.findall(
str(child_fundstelle_info.text))
get_xml = "".join(get_xml) + ".txt"
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, '*.txt')):
if get_xml in filename:
with open(filename, "r") as txtfile:
...
This loops over all text files in the directory to find the ones that contains the correct name. Instead, filter as soon as possible:
get_xml = pdf_xml_pattern.findall(
str(child_fundstelle_info.text))
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, "**.txt".format("".join(get_xml)))):
with open(filename, "r") as txtfile:
...
This assumes that there are multiple files that can fit. If there is only one, then it becomes even simpler:
get_xml = pdf_xml_pattern.findall(
str(child_fundstelle_info.text))
# txt path
filename = os.path.join(PATH_FINAL_TXT, "".join(get_xml) + ".txt"):
with open(filename, "r") as txtfile:
...
Style
Currently your code is one gigantic function. This makes it very hard to read (especially because the indentation level becomes very deep). Try to pull out single actions into their own functions, taking all relevant inputs as arguments and returning its results. This also allows you to give these functions names that make it more obvious what happens.
Code Review (and indeed the whole StackExchange network, except for some obvious exceptions, like Stack Overflow en español) is in English. This makes it a bit harder to get good reviews since your question contains a lot of German, since it is harder to read for most people.
However, what is actually worse is that you are not sticking to German names, either. You mix German variable names with English variable names (some of which contain grammatical errors, like speecher_list
, which is probably supposed to be speakers_list
). When programming I usually stick with English names, because it makes distributing your code easier and also it jars less that all the standard functions have English names.
I completely agree with all points in your post. I'll be more specific about your points in my future codes. I have just posted the cProfile output of about 6 hours. It looks like the problem lies with the re.split method, because the method gets more than 15 regexes with each iteration by the "|" .join.
â madik_atma
Jul 12 at 15:07
add a comment |Â
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
up vote
3
down vote
Runtime
This is the only obvious place I could find that looks like it might scale badly with an increasing number of files in the directory:
get_xml = pdf_xml_pattern.findall(
str(child_fundstelle_info.text))
get_xml = "".join(get_xml) + ".txt"
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, '*.txt')):
if get_xml in filename:
with open(filename, "r") as txtfile:
...
This loops over all text files in the directory to find the ones that contains the correct name. Instead, filter as soon as possible:
get_xml = pdf_xml_pattern.findall(
str(child_fundstelle_info.text))
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, "**.txt".format("".join(get_xml)))):
with open(filename, "r") as txtfile:
...
This assumes that there are multiple files that can fit. If there is only one, then it becomes even simpler:
get_xml = pdf_xml_pattern.findall(
str(child_fundstelle_info.text))
# txt path
filename = os.path.join(PATH_FINAL_TXT, "".join(get_xml) + ".txt"):
with open(filename, "r") as txtfile:
...
Style
Currently your code is one gigantic function. This makes it very hard to read (especially because the indentation level becomes very deep). Try to pull out single actions into their own functions, taking all relevant inputs as arguments and returning its results. This also allows you to give these functions names that make it more obvious what happens.
Code Review (and indeed the whole StackExchange network, except for some obvious exceptions, like Stack Overflow en español) is in English. This makes it a bit harder to get good reviews since your question contains a lot of German, since it is harder to read for most people.
However, what is actually worse is that you are not sticking to German names, either. You mix German variable names with English variable names (some of which contain grammatical errors, like speecher_list
, which is probably supposed to be speakers_list
). When programming I usually stick with English names, because it makes distributing your code easier and also it jars less that all the standard functions have English names.
I completely agree with all points in your post. I'll be more specific about your points in my future codes. I have just posted the cProfile output of about 6 hours. It looks like the problem lies with the re.split method, because the method gets more than 15 regexes with each iteration by the "|" .join.
â madik_atma
Jul 12 at 15:07
add a comment |Â
up vote
3
down vote
Runtime
This is the only obvious place I could find that looks like it might scale badly with an increasing number of files in the directory:
get_xml = pdf_xml_pattern.findall(
str(child_fundstelle_info.text))
get_xml = "".join(get_xml) + ".txt"
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, '*.txt')):
if get_xml in filename:
with open(filename, "r") as txtfile:
...
This loops over all text files in the directory to find the ones that contains the correct name. Instead, filter as soon as possible:
get_xml = pdf_xml_pattern.findall(
str(child_fundstelle_info.text))
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, "**.txt".format("".join(get_xml)))):
with open(filename, "r") as txtfile:
...
This assumes that there are multiple files that can fit. If there is only one, then it becomes even simpler:
get_xml = pdf_xml_pattern.findall(
str(child_fundstelle_info.text))
# txt path
filename = os.path.join(PATH_FINAL_TXT, "".join(get_xml) + ".txt"):
with open(filename, "r") as txtfile:
...
Style
Currently your code is one gigantic function. This makes it very hard to read (especially because the indentation level becomes very deep). Try to pull out single actions into their own functions, taking all relevant inputs as arguments and returning its results. This also allows you to give these functions names that make it more obvious what happens.
Code Review (and indeed the whole StackExchange network, except for some obvious exceptions, like Stack Overflow en español) is in English. This makes it a bit harder to get good reviews since your question contains a lot of German, since it is harder to read for most people.
However, what is actually worse is that you are not sticking to German names, either. You mix German variable names with English variable names (some of which contain grammatical errors, like speecher_list
, which is probably supposed to be speakers_list
). When programming I usually stick with English names, because it makes distributing your code easier and also it jars less that all the standard functions have English names.
I completely agree with all points in your post. I'll be more specific about your points in my future codes. I have just posted the cProfile output of about 6 hours. It looks like the problem lies with the re.split method, because the method gets more than 15 regexes with each iteration by the "|" .join.
â madik_atma
Jul 12 at 15:07
add a comment |Â
up vote
3
down vote
up vote
3
down vote
Runtime
This is the only obvious place I could find that looks like it might scale badly with an increasing number of files in the directory:
get_xml = pdf_xml_pattern.findall(
str(child_fundstelle_info.text))
get_xml = "".join(get_xml) + ".txt"
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, '*.txt')):
if get_xml in filename:
with open(filename, "r") as txtfile:
...
This loops over all text files in the directory to find the ones that contains the correct name. Instead, filter as soon as possible:
get_xml = pdf_xml_pattern.findall(
str(child_fundstelle_info.text))
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, "**.txt".format("".join(get_xml)))):
with open(filename, "r") as txtfile:
...
This assumes that there are multiple files that can fit. If there is only one, then it becomes even simpler:
get_xml = pdf_xml_pattern.findall(
str(child_fundstelle_info.text))
# txt path
filename = os.path.join(PATH_FINAL_TXT, "".join(get_xml) + ".txt"):
with open(filename, "r") as txtfile:
...
Style
Currently your code is one gigantic function. This makes it very hard to read (especially because the indentation level becomes very deep). Try to pull out single actions into their own functions, taking all relevant inputs as arguments and returning its results. This also allows you to give these functions names that make it more obvious what happens.
Code Review (and indeed the whole StackExchange network, except for some obvious exceptions, like Stack Overflow en español) is in English. This makes it a bit harder to get good reviews since your question contains a lot of German, since it is harder to read for most people.
However, what is actually worse is that you are not sticking to German names, either. You mix German variable names with English variable names (some of which contain grammatical errors, like speecher_list
, which is probably supposed to be speakers_list
). When programming I usually stick with English names, because it makes distributing your code easier and also it jars less that all the standard functions have English names.
Runtime
This is the only obvious place I could find that looks like it might scale badly with an increasing number of files in the directory:
get_xml = pdf_xml_pattern.findall(
str(child_fundstelle_info.text))
get_xml = "".join(get_xml) + ".txt"
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, '*.txt')):
if get_xml in filename:
with open(filename, "r") as txtfile:
...
This loops over all text files in the directory to find the ones that contains the correct name. Instead, filter as soon as possible:
get_xml = pdf_xml_pattern.findall(
str(child_fundstelle_info.text))
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, "**.txt".format("".join(get_xml)))):
with open(filename, "r") as txtfile:
...
This assumes that there are multiple files that can fit. If there is only one, then it becomes even simpler:
get_xml = pdf_xml_pattern.findall(
str(child_fundstelle_info.text))
# txt path
filename = os.path.join(PATH_FINAL_TXT, "".join(get_xml) + ".txt"):
with open(filename, "r") as txtfile:
...
Style
Currently your code is one gigantic function. This makes it very hard to read (especially because the indentation level becomes very deep). Try to pull out single actions into their own functions, taking all relevant inputs as arguments and returning its results. This also allows you to give these functions names that make it more obvious what happens.
Code Review (and indeed the whole StackExchange network, except for some obvious exceptions, like Stack Overflow en español) is in English. This makes it a bit harder to get good reviews since your question contains a lot of German, since it is harder to read for most people.
However, what is actually worse is that you are not sticking to German names, either. You mix German variable names with English variable names (some of which contain grammatical errors, like speecher_list
, which is probably supposed to be speakers_list
). When programming I usually stick with English names, because it makes distributing your code easier and also it jars less that all the standard functions have English names.
answered Jul 11 at 13:14
Graipher
20.4k42981
20.4k42981
I completely agree with all points in your post. I'll be more specific about your points in my future codes. I have just posted the cProfile output of about 6 hours. It looks like the problem lies with the re.split method, because the method gets more than 15 regexes with each iteration by the "|" .join.
â madik_atma
Jul 12 at 15:07
add a comment |Â
I completely agree with all points in your post. I'll be more specific about your points in my future codes. I have just posted the cProfile output of about 6 hours. It looks like the problem lies with the re.split method, because the method gets more than 15 regexes with each iteration by the "|" .join.
â madik_atma
Jul 12 at 15:07
I completely agree with all points in your post. I'll be more specific about your points in my future codes. I have just posted the cProfile output of about 6 hours. It looks like the problem lies with the re.split method, because the method gets more than 15 regexes with each iteration by the "|" .join.
â madik_atma
Jul 12 at 15:07
I completely agree with all points in your post. I'll be more specific about your points in my future codes. I have just posted the cProfile output of about 6 hours. It looks like the problem lies with the re.split method, because the method gets more than 15 regexes with each iteration by the "|" .join.
â madik_atma
Jul 12 at 15:07
add a comment |Â
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f198280%2fextracting-text-from-bundestag-proceedings-as-directed-by-xml-files-and-exporti%23new-answer', 'question_page');
);
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function ()
StackExchange.helpers.onClickDraftSave('#login-link');
);
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Can you add the output of the following command?
python -m cProfile -s tottime <filename>.py 100 1
â Mast
Jul 11 at 12:36
cProfile
looks more complicated than it is. Read the manual and a couple of example uses, mess around a bit on smaller pieces of code and you'll figure it out in no-time at all.â Mast
Jul 11 at 12:38
1
You should run the profiler with more than one file (but less than 1000). Obviously there is something which takes longer for each additional file (probably an
in
on some growing list or something like that), which is not visible when only running with one file.â Graipher
Jul 11 at 12:54
1
understood. @Graipher I will try it now. Thank you for your help.
â madik_atma
Jul 11 at 12:58
2
When you've profiled your code, please add the output to the question itself, not in the comments.
â Mast
Jul 11 at 13:01