Extracting text from Bundestag proceedings as directed by XML files, and exporting to JSON

.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty{ margin-bottom:0; }

up vote
3
down vote

favorite

I have a long script here that does the following for me: I have 9000 XML files from which I parse information; I then look that information up in about 7000 TXT files, and in the end everything is saved as JSON. The script does what it should, but it does it very slowly.

I am very new to Python; I did not learn it from books but simply by trying things out. The code runs at normal speed for the first 100 or so iterations, but from about 120 iterations onward it becomes very slow.

What can I do about it? I've been looking at solutions like generators, cProfile, Cython, PyPy, and similar, but they are all too advanced for me. How can I optimize this code to make it run faster? gc.collect() did not help. I think the problem is more about the algorithm and the control flow than about what the code does.

Sample Files: Files

cProfile Output: cProfile Output

import xml.etree.ElementTree as ET
import glob, os, re
import mpu.io


# Directory with the XML files; two() globs "*.xml" relative to it after the chdir below.
PATH_FINAL_XML = "/Users/x/PycharmProjects/BA/booking-system/final step/xml files"

# Directory with the .txt files that two() searches for the matching text file.
PATH_FINAL_TXT = "/Users/x/PycharmProjects/BA/booking-system/final step/txt files"

# Runs at import time; os.chdir raises if the directory does not exist.
os.chdir(PATH_FINAL_XML)

#path, dirs, files = os.walk(PATH).__next__()
#print("Files", len(files))

# Accumulator filled by two(): one dict per law that passes the initiator filter.
gesetzliste = []

# Not referenced anywhere in the visible part of the file; kept because other
# code may still rely on the names.
gesetzanzahlliste = []

zeitliste = []

# Initiators whose bills two() skips (compared against the INITIATIVE value).
# NOTE(review): neither "Brandenburg" nor "Hessen" is listed — confirm whether
# those two were left out on purpose.
br_list = [
    "Baden-Württemberg", "Bayern", "Berlin", "Bremen", "Hamburg",
    "Mecklenburg-Vorpommern",
    "Nordrhein-Westfalen", "Niedersachsen", "Rheinland-Pfalz", "Saarland",
    "Sachsen", "Sachsen-Anhalt", "Schleswig-Holstein", "Thüringen", "Bundesregierung",
]

# FUNKTION value two() compares against to tell MdBs from other speakers.
MdB = "MdB"

# All patterns below are raw strings with their regex escapes (\d, \s, \n, \b)
# in place. two() strips the first three characters of each splitter pattern
# (speecher[3:]), which only works when they start with the 3-character "\n+".

# Digits between the last "/" and ".pdf" in a link (PDFMiner file naming).
pdf_xml_pattern = re.compile(r"(?<=/)\d+(?=\.pdf)")

# Split before a line that starts with the (vice-)president of the Bundestag
# followed by optional "Dr." / name words and a colon.
split_pattern_president = r"\n+(?=(?:Präsident|Präsidentin|Vizepräsident|Vizepräsidentin)\s*(?:(?:Dr\.)*\s*[A-ZÖÜÄß]+[a-zöäüß]*\s*)*:)"

# Split before any other speaker line of the form "[Dr.] Name Name (PARTY)":
# MdBs, ministers, guest speakers, etc.
# NOTE(review): the parentheses are read as literal "(" / ")" and the hyphen in
# the class as a literal "-" (an unescaped "X-," is an invalid range) — confirm
# against the original script.
split_rest_pattern = r"\n+(?=(?:Dr\.)*\s*[A-ZÄÖÜ]*[a-züäö]*\s*[A-ZÄÖÜ]*[a-züäö]*\s*\([A-ZÜÄÖX\-,./\s]*\))"

# Identify interjections (Zurufe): a parenthesised run of at least three
# words whose words do not start with Dokument / Drucksache / Tagesordnung /
# the Greens' party name.
# NOTE(review): the outer parentheses are read as literal and the hyphens in
# both classes as literal "-"; the inner party-name group is left capturing,
# so re.findall returns 2-tuples — confirm against the original script.
zurufe_pattern = r"(\((?:(?!\bDokument\b|\bDrucksache\b|\bTagesordnung\b|\b(BÜNDNIS 90/DIE GRÜNEN)\b)[A-Za-z_äÄöÖüÜß\[\]/,._–!?:;'\-—0-9]*\s){2,}[A-Za-z_äÄöÖüÜß;!.?\[\]/,':\-—–0-9]+[.,?!]*?\))"


def two():
 # For every *.xml file in the current directory: copy the root's child
 # elements into a dict (`gesetz`), locate the .txt file named by each
 # FUNDSTELLE_LINK, split its text with per-speaker regexes, attach the
 # matching blocks to the speakers, and collect the result in `gesetzliste`,
 # which is finally passed to mpu.io.write with a .json file name.
 # NOTE(review): in this listing every body line carries the same one-space
 # indent and several literals look truncated (`speaker =`, `speeches =`,
 # `"".format(...)`, dict entries without braces), so the block nesting cannot
 # be read off the text — restore it from the original script before running.

 for file in glob.glob("*.xml"): # xml path

 gesetz = dict()

 '''Childnode Lists'''

 schlagworterliste = list()

 sachgebietliste = list()

 drsliste = list()

 plenumliste = list()

 vorgangliste = list()

 speakeritems = list()

 mdb_splitterlist = list()

 mdb_not_splitterlist = list()

 splitterlist = list()

 '''Childnode Lists'''

 file_id = file.replace(".xml", "")
 print("Gesetzentwurf: ", file)

 tree = ET.parse(file)
 root = tree.getroot()

 # Copy the root's direct children into `gesetz`; SCHLAGWORT and SACHGEBIET
 # values are collected in lists, WICHTIGE_DRUCKSACHE and PLENUM entries are
 # built from their sub-elements, everything else is stored as tag -> text.
 for child in root:

 gesetz["File_ID"] = file_id

 if str(child.tag) == "SCHLAGWORT":
 schlagworterliste.append(child.text)
 gesetz[child.tag] = schlagworterliste

 elif str(child.tag) == "SACHGEBIET":
 sachgebietliste.append(child.text)
 gesetz[child.tag] = sachgebietliste

 elif str(child.tag) == "WICHTIGE_DRUCKSACHE":
 for child_wichtige_drucksache_herausgeber in child.findall("DRS_HERAUSGEBER"):
 for child_wichtige_drucksache_nummer in child.findall("DRS_NUMMER"):
 for child_wichtige_drucksache_typ in child.findall("DRS_TYP"):
 drs = child_wichtige_drucksache_herausgeber.tag:child_wichtige_drucksache_herausgeber.text,
 child_wichtige_drucksache_nummer.tag:child_wichtige_drucksache_nummer.text,
 child_wichtige_drucksache_typ.tag:child_wichtige_drucksache_typ.text

 drsliste.append(drs)
 gesetz[child.tag] = drsliste

 elif str(child.tag) == "PLENUM":
 for child_plenum_klartext in child.findall("PLPR_KLARTEXT"):
 for child_plenum_herausgeber in child.findall("PLPR_HERAUSGEBER"):
 for child_plenum_nummer in child.findall("PLPR_NUMMER"):
 for child_plenum_seiten in child.findall("PLPR_SEITEN"):
 for child_plenum_link in child.findall("PLPR_LINK"):
 plenum = child_plenum_klartext.tag:child_plenum_klartext.text,
 child_plenum_herausgeber.tag:child_plenum_herausgeber.text,
 child_plenum_nummer.tag:child_plenum_nummer.text,
 child_plenum_seiten.tag:child_plenum_seiten.text,
 child_plenum_link.tag:child_plenum_link.text

 plenumliste.append(plenum)
 gesetz["PLPR"] = plenumliste


 else:
 gesetz[child.tag] = child.text

 # Skip further processing when the initiator is one of the entries in br_list.
 if "INITIATIVE" in gesetz.keys():
 if gesetz['INITIATIVE'] in br_list:
 print("CONTINUE")
 continue
 else:
 print("DEVAM")


 # Descend one level further and build a `vorgang` record from the
 # ZUORDNUNG / URHEBER / FUNDSTELLE sub-elements.
 for child2 in child:
 for child_zuordnung in child2.findall("ZUORDNUNG"):
 for child_urheber in child2.findall("URHEBER"):

 # NOTE(review): how much of the following code this "Beratung" test
 # guards cannot be told from the flattened indentation.
 if "Beratung" in child_urheber.text:
 print("Beratung: ", child_urheber.text)

 for child_fundstelle in child2.findall("FUNDSTELLE"):
 vorgang = child_zuordnung.tag:child_zuordnung.text,
 child_urheber.tag:child_urheber.text,
 child_fundstelle.tag:child_fundstelle.text

 # Derive the expected text-file name from the FUNDSTELLE_LINK via pdf_xml_pattern.
 for child_fundstelle_info in child2.findall("FUNDSTELLE_LINK"):
 get_xml = pdf_xml_pattern.findall(str(child_fundstelle_info.text))
 get_xml = "".join(get_xml)+".txt"

 # NOTE(review): the TXT directory is globbed again for every
 # FUNDSTELLE_LINK; the listing is loop-invariant and could be built once.
 for filename in glob.glob(os.path.join(PATH_FINAL_TXT, '*.txt')): # txt path

 if get_xml in filename:
 with open(filename, "r") as txtfile:

 check_filename = filename[-9:-4] # for wp8 - wp12

 print("Sitzung:", check_filename+".txt")

 txt = txtfile.read()

 # Three clean-up substitutions on the raw text.
 # NOTE(review): the patterns appear to have lost their backslashes
 # (e.g. `[sS]`, `d+`, `s*`) — verify against the original script.
 text = re.sub(r"^[sS]*?(?:erÃƒÂ¶ffnet)", "", txt) 

 text = re.sub(r"((cid|(cid):d+)|(Deutschers+Bundestags*D*s*d*W*w*s*D*sd*.s*w*W*w*W*w*W*w*W*d+W*w*W*d(.s+[A-Z_ÃƒÂ¤ÃƒÂ„ÃƒÂ¶ÃƒÂ–ÃƒÂ¼ÃƒÂœÃƒÂŸ][A-Za-z_ÃƒÂ¤ÃƒÂ„ÃƒÂ¶ÃƒÂ–ÃƒÂ¼ÃƒÂœÃƒÂŸ]*s*d*|d+)))", " ", text)

 text = re.sub(r"((A)|(B)|(C)|(D))", "", text)

 # Take the page references found in the FUNDSTELLE text and widen the
 # range by one page on each side.
 # NOTE(review): page[0] raises IndexError when nothing matches, and more
 # than two matches leave page_begin/page_end at 0.
 xml_page_pattern = r"(d+[A-D])"

 page = re.findall(xml_page_pattern, child_fundstelle.text)

 page_begin = 0
 page_end = 0

 if len(page) < 2:
 page_begin = int(page[0][:-1])-1; page_end = int(page[0][:-1])+1

 elif len(page) == 2:
 page_begin = int(page[0][:-1])-1; page_end = int(page[1][:-1])+1


 # NOTE(review): the format string contains no `{}` placeholders here, so
 # page_begin/page_end are not inserted into the pattern as written.
 get_pages_pattern = re.compile(r"(?<=)(.*)(?=)".format(page_begin, page_end), flags=re.DOTALL)

 text = get_pages_pattern.findall(text)

 text = "".join(text)


 # Copy every child of each BESCHLUSS element into the vorgang record.
 for child_beschluss in child2.findall("BESCHLUSS"):
 for beschluss_child in child_beschluss:
 vorgang[beschluss_child.tag] = beschluss_child.text

 VORNAME = ""
 NACHNAME = ""
 FRAKTION = ""

 # Per PERSOENLICHER_URHEBER: collect the speaker's fields, build the split
 # regexes, split the text and attach the matching blocks to the speaker.
 for child_speaker in child2.findall('PERSOENLICHER_URHEBER'):

 speaker = 

 TITEL = ""
 WAHLKREISZUSATZ = ""
 FUNKTION = ""

 ############################################################################################################

 # NOTE(review): this loop reuses the name `child`, which is also the loop
 # variable of `for child in root` above.
 for child in child_speaker:
 speaker[child.tag] = child.text

 if child.tag == "PERSON_TITEL":
 TITEL = child.text

 elif child.tag == "VORNAME":
 VORNAME = child.text

 elif child.tag == "NACHNAME":
 NACHNAME = child.text

 elif child.tag == "WAHLKREISZUSATZ":
 WAHLKREISZUSATZ = child.text

 elif child.tag == "FUNKTION":
 FUNKTION = child.text

 elif child.tag == "FRAKTION":
 FRAKTION = child.text

 else:
 pass


 ############################################################################################################


 splitter_mdb = ""
 splitter_only_name_mdb = ""

 splitter_not_mdb = ""
 splitter_only_name_not_mdb = ""

 ############################################################################################################

 # Replace the FRAKTION value by the spelling used in the patterns below.
 if FRAKTION == "BÃƒÂœNDNIS 90/DIE GRÃƒÂœNEN": FRAKTION = "BÃƒÂœNDNISs*90/DIEs*GRÃƒÂœNEN"

 elif FRAKTION == "CDU/CSU": FRAKTION = "CDU/CSU"

 elif FRAKTION == "FDP": FRAKTION = "F.D.P."

 ############################################################################################################

 # For file names starting with 08, 09, 10 or 11 the splitter is built from
 # title/surname only (no VORNAME).
 if str(check_filename).startswith(("08", "09", "10", "11")): #WP

 #print("here")
 # only MdBs
 # NOTE(review): `"".format(FRAKTION)` yields an empty string, so FRAKTION
 # never reaches any of the patterns below as written.
 if (not TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_only_name_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_only_name_mdb = r"n+(?=" + re.escape(TITEL) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (WAHLKREISZUSATZ) and (not TITEL) and (FUNKTION == MdB):
 splitter_only_name_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"

 elif (TITEL) and (WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_only_name_mdb = r"n+(?=" + re.escape(TITEL) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"

 # only Minister etc.
 elif (not FUNKTION == MdB) and (TITEL):
 splitter_only_name_not_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (not FUNKTION == MdB) and (not TITEL):
 splitter_only_name_not_mdb = r"n+(?=" + re.escape(TITEL) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"

 # NOTE(review): same condition as the previous elif, so this branch cannot be reached.
 elif (not FUNKTION == MdB) and (not TITEL):
 splitter_only_name_not_mdb = splitter_not_mdb = r"n+(?=" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"



 else:
 ############################################################################################################
 ### Splitter Regex ###

 # only MdBs
 if (not TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_mdb = r"s+(?=" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_mdb = r"s+(?=" + re.escape(TITEL) + "s*"+ re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (WAHLKREISZUSATZ) and (not TITEL) and (FUNKTION == MdB):
 splitter_mdb = r"s+(?=" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"

 elif (TITEL) and (WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_mdb = r"s+(?=" + re.escape(TITEL) + "s*" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"

 ############################################################################################################

 # Ministers etc., not MdBs.
 elif (not FUNKTION == MdB) and (TITEL):
 splitter_not_mdb = r"s+(?=" + re.escape(TITEL) + "s+" + re.escape(VORNAME) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"

 elif (not FUNKTION == MdB) and (not TITEL):
 splitter_not_mdb = r"s+(?=" + re.escape(VORNAME) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"

 ### End of splitter regex ###

 ############################################################################################################

 # Collect the patterns per speaker kind; empty strings are filtered out further down.
 if FUNKTION == MdB:
 mdb_splitterlist.append(splitter_mdb)
 if str(check_filename).startswith(("08", "09", "10", "11")):
 mdb_splitterlist.append(splitter_only_name_mdb)
 else:
 pass

 elif FUNKTION != MdB:
 mdb_not_splitterlist.append(splitter_not_mdb)
 if str(check_filename).startswith(("08", "09", "10", "11")):
 mdb_not_splitterlist.append(splitter_only_name_not_mdb)
 else:
 pass


 ############################################################################################################
 # Combine the speaker-specific patterns with the two generic ones; str_list
 # is the same list without empty entries.
 splitterlist = mdb_not_splitterlist + mdb_splitterlist


 splitterlist.append(split_pattern_president)
 splitterlist.append(split_rest_pattern)

 str_list = list(filter(None, splitterlist))

 splitted_text = 

 # Split the text wherever any of the patterns (joined with "|") matches.
 try:

 splitted_text = re.split(pattern='|'.join(str_list), string=text)

 print("Split erfolgreich.")

 # On any error: print it, append the file names to logs-me.txt and continue
 # with the next loop iteration.
 except Exception as e:

 print(e)

 with open("logs-me.txt", "a") as logme:
 logme.write(filename+" "+file_id+" "+file+"n")

 # Redundant: the `with` statement already closes the file.
 logme.close()

 print("ÃƒÂ¼berspringe:", file_id)

 continue


 speeches = 

 ############################################################################################################

 # For every block, run each splitter pattern once more: its first three
 # characters are dropped and "=" is replaced by ":" before re.findall.
 # NOTE(review): this iterates splitterlist (which can still contain empty
 # strings), not the filtered str_list.
 for speechblock in splitted_text:

 for speecher in splitterlist:

 speecher = str(speecher[3:]).replace("=", ":") # regex

 founded_speecher = re.findall(speecher, speechblock)

 # Non-MdB: the block counts for the speaker when both VORNAME and NACHNAME
 # occur in the joined findall result.
 if (FUNKTION != MdB):
 if (speaker['VORNAME'] in "".join(founded_speecher) and speaker['NACHNAME'] in "".join(founded_speecher)):

 speeches.append(speechblock)

 zurufe = re.findall(zurufe_pattern, speechblock, flags=re.DOTALL | re.MULTILINE)

 speech = "TEXT":speeches, "ZURUFE":zurufe

 speaker['INHALT'] = speech

 break #todo -> workaround

 # MdB: VORNAME+NACHNAME or NACHNAME+FRAKTION must occur in the joined result.
 elif (FUNKTION == MdB):
 if (speaker['VORNAME'] in "".join(founded_speecher) and speaker['NACHNAME'] in "".join(founded_speecher)
 or speaker['NACHNAME'] in "".join(founded_speecher) and speaker['FRAKTION'] in "".join(founded_speecher)):

 speeches.append(speechblock)

 zurufe = re.findall(zurufe_pattern, speechblock, flags=re.DOTALL | re.MULTILINE)

 speech = "TEXT":speeches, "ZURUFE":zurufe

 speaker['INHALT'] = speech

 break #todo -> workaround

 # Record the speaker under vorgang["REDNER"], then the vorgang under gesetz["VORGANG"].
 speakeritems.append(speaker)

 if "REDNER" in vorgang:
 vorgang["REDNER"].append(speaker)
 else:
 vorgang["REDNER"] = [speaker]


 vorgangliste.append(vorgang)
 gesetz["VORGANG"] = vorgangliste

 # Keep the law only if it has an INITIATIVE that is not in br_list.
 if "INITIATIVE" in gesetz.keys():
 if gesetz['INITIATIVE'] in br_list:
 pass
 elif gesetz['INITIATIVE'] not in br_list:
 gesetzliste.append(gesetz)
 else:
 print("hier stimmt etwas nicht.")


 # Hand the accumulated list to mpu.io.write.
 # NOTE(review): if this call sits inside the per-file loop in the original
 # script, the whole growing list is rewritten for every XML file — check the nesting.
 '''Write JSON FILE'''
 mpu.io.write("xssy33s.json", gesetzliste)

 print(len(gesetzliste))

# Run the whole pipeline when the file is executed.
two()

edited Jul 12 at 15:05

asked Jul 11 at 11:53

madik_atma

164

Can you add the output of the following command? python -m cProfile -s tottime <filename>.py 100 1
– Mast
Jul 11 at 12:36

cProfile looks more complicated than it is. Read the manual and a couple of example uses, mess around a bit on smaller pieces of code and you'll figure it out in no-time at all.
– Mast
Jul 11 at 12:38

1

You should run the profiler with more than one file (but fewer than 1000). Obviously something takes longer with each additional file (probably an `in` test on some growing list, or something like that), which is not visible when running with only one file.
– Graipher
Jul 11 at 12:54

1

understood. @Graipher I will try it now. Thank you for your help.
– madik_atma
Jul 11 at 12:58

2

When you've profiled your code, please add the output to the question itself, not in the comments.
– Mast
Jul 11 at 13:01

|
show 5 more comments

up vote
3
down vote

favorite

I am very new to Python; I did not learn it from books but simply by trying things out. The code runs at normal speed for the first 100 or so iterations, but from about 120 iterations onward it becomes very slow.

Sample Files: Files

cProfile Output: cProfile Output

import xml.etree.ElementTree as ET
import glob, os, re
import mpu.io


# Directory with the XML files; two() globs "*.xml" relative to it after the chdir below.
PATH_FINAL_XML = "/Users/x/PycharmProjects/BA/booking-system/final step/xml files"

# Directory with the .txt files that two() searches for the matching text file.
PATH_FINAL_TXT = "/Users/x/PycharmProjects/BA/booking-system/final step/txt files"

# Runs at import time; os.chdir raises if the directory does not exist.
os.chdir(PATH_FINAL_XML)

#path, dirs, files = os.walk(PATH).__next__()
#print("Files", len(files))

# Accumulator filled by two(): one dict per law that passes the initiator filter.
gesetzliste = []

# Not referenced anywhere in the visible part of the file; kept because other
# code may still rely on the names.
gesetzanzahlliste = []

zeitliste = []

# Initiators whose bills two() skips (compared against the INITIATIVE value).
# NOTE(review): neither "Brandenburg" nor "Hessen" is listed — confirm whether
# those two were left out on purpose.
br_list = [
    "Baden-Württemberg", "Bayern", "Berlin", "Bremen", "Hamburg",
    "Mecklenburg-Vorpommern",
    "Nordrhein-Westfalen", "Niedersachsen", "Rheinland-Pfalz", "Saarland",
    "Sachsen", "Sachsen-Anhalt", "Schleswig-Holstein", "Thüringen", "Bundesregierung",
]

# FUNKTION value two() compares against to tell MdBs from other speakers.
MdB = "MdB"

# All patterns below are raw strings with their regex escapes (\d, \s, \n, \b)
# in place. two() strips the first three characters of each splitter pattern
# (speecher[3:]), which only works when they start with the 3-character "\n+".

# Digits between the last "/" and ".pdf" in a link (PDFMiner file naming).
pdf_xml_pattern = re.compile(r"(?<=/)\d+(?=\.pdf)")

# Split before a line that starts with the (vice-)president of the Bundestag
# followed by optional "Dr." / name words and a colon.
split_pattern_president = r"\n+(?=(?:Präsident|Präsidentin|Vizepräsident|Vizepräsidentin)\s*(?:(?:Dr\.)*\s*[A-ZÖÜÄß]+[a-zöäüß]*\s*)*:)"

# Split before any other speaker line of the form "[Dr.] Name Name (PARTY)":
# MdBs, ministers, guest speakers, etc.
# NOTE(review): the parentheses are read as literal "(" / ")" and the hyphen in
# the class as a literal "-" (an unescaped "X-," is an invalid range) — confirm
# against the original script.
split_rest_pattern = r"\n+(?=(?:Dr\.)*\s*[A-ZÄÖÜ]*[a-züäö]*\s*[A-ZÄÖÜ]*[a-züäö]*\s*\([A-ZÜÄÖX\-,./\s]*\))"

# Identify interjections (Zurufe): a parenthesised run of at least three
# words whose words do not start with Dokument / Drucksache / Tagesordnung /
# the Greens' party name.
# NOTE(review): the outer parentheses are read as literal and the hyphens in
# both classes as literal "-"; the inner party-name group is left capturing,
# so re.findall returns 2-tuples — confirm against the original script.
zurufe_pattern = r"(\((?:(?!\bDokument\b|\bDrucksache\b|\bTagesordnung\b|\b(BÜNDNIS 90/DIE GRÜNEN)\b)[A-Za-z_äÄöÖüÜß\[\]/,._–!?:;'\-—0-9]*\s){2,}[A-Za-z_äÄöÖüÜß;!.?\[\]/,':\-—–0-9]+[.,?!]*?\))"


def two():
 # For every *.xml file in the current directory: copy the root's child
 # elements into a dict (`gesetz`), locate the .txt file named by each
 # FUNDSTELLE_LINK, split its text with per-speaker regexes, attach the
 # matching blocks to the speakers, and collect the result in `gesetzliste`,
 # which is finally passed to mpu.io.write with a .json file name.
 # NOTE(review): in this listing every body line carries the same one-space
 # indent and several literals look truncated (`speaker =`, `speeches =`,
 # `"".format(...)`, dict entries without braces), so the block nesting cannot
 # be read off the text — restore it from the original script before running.

 for file in glob.glob("*.xml"): # xml path

 gesetz = dict()

 '''Childnode Lists'''

 schlagworterliste = list()

 sachgebietliste = list()

 drsliste = list()

 plenumliste = list()

 vorgangliste = list()

 speakeritems = list()

 mdb_splitterlist = list()

 mdb_not_splitterlist = list()

 splitterlist = list()

 '''Childnode Lists'''

 file_id = file.replace(".xml", "")
 print("Gesetzentwurf: ", file)

 tree = ET.parse(file)
 root = tree.getroot()

 # Copy the root's direct children into `gesetz`; SCHLAGWORT and SACHGEBIET
 # values are collected in lists, WICHTIGE_DRUCKSACHE and PLENUM entries are
 # built from their sub-elements, everything else is stored as tag -> text.
 for child in root:

 gesetz["File_ID"] = file_id

 if str(child.tag) == "SCHLAGWORT":
 schlagworterliste.append(child.text)
 gesetz[child.tag] = schlagworterliste

 elif str(child.tag) == "SACHGEBIET":
 sachgebietliste.append(child.text)
 gesetz[child.tag] = sachgebietliste

 elif str(child.tag) == "WICHTIGE_DRUCKSACHE":
 for child_wichtige_drucksache_herausgeber in child.findall("DRS_HERAUSGEBER"):
 for child_wichtige_drucksache_nummer in child.findall("DRS_NUMMER"):
 for child_wichtige_drucksache_typ in child.findall("DRS_TYP"):
 drs = child_wichtige_drucksache_herausgeber.tag:child_wichtige_drucksache_herausgeber.text,
 child_wichtige_drucksache_nummer.tag:child_wichtige_drucksache_nummer.text,
 child_wichtige_drucksache_typ.tag:child_wichtige_drucksache_typ.text

 drsliste.append(drs)
 gesetz[child.tag] = drsliste

 elif str(child.tag) == "PLENUM":
 for child_plenum_klartext in child.findall("PLPR_KLARTEXT"):
 for child_plenum_herausgeber in child.findall("PLPR_HERAUSGEBER"):
 for child_plenum_nummer in child.findall("PLPR_NUMMER"):
 for child_plenum_seiten in child.findall("PLPR_SEITEN"):
 for child_plenum_link in child.findall("PLPR_LINK"):
 plenum = child_plenum_klartext.tag:child_plenum_klartext.text,
 child_plenum_herausgeber.tag:child_plenum_herausgeber.text,
 child_plenum_nummer.tag:child_plenum_nummer.text,
 child_plenum_seiten.tag:child_plenum_seiten.text,
 child_plenum_link.tag:child_plenum_link.text

 plenumliste.append(plenum)
 gesetz["PLPR"] = plenumliste


 else:
 gesetz[child.tag] = child.text

 # Skip further processing when the initiator is one of the entries in br_list.
 if "INITIATIVE" in gesetz.keys():
 if gesetz['INITIATIVE'] in br_list:
 print("CONTINUE")
 continue
 else:
 print("DEVAM")


 # Descend one level further and build a `vorgang` record from the
 # ZUORDNUNG / URHEBER / FUNDSTELLE sub-elements.
 for child2 in child:
 for child_zuordnung in child2.findall("ZUORDNUNG"):
 for child_urheber in child2.findall("URHEBER"):

 # NOTE(review): how much of the following code this "Beratung" test
 # guards cannot be told from the flattened indentation.
 if "Beratung" in child_urheber.text:
 print("Beratung: ", child_urheber.text)

 for child_fundstelle in child2.findall("FUNDSTELLE"):
 vorgang = child_zuordnung.tag:child_zuordnung.text,
 child_urheber.tag:child_urheber.text,
 child_fundstelle.tag:child_fundstelle.text

 # Derive the expected text-file name from the FUNDSTELLE_LINK via pdf_xml_pattern.
 for child_fundstelle_info in child2.findall("FUNDSTELLE_LINK"):
 get_xml = pdf_xml_pattern.findall(str(child_fundstelle_info.text))
 get_xml = "".join(get_xml)+".txt"

 # NOTE(review): the TXT directory is globbed again for every
 # FUNDSTELLE_LINK; the listing is loop-invariant and could be built once.
 for filename in glob.glob(os.path.join(PATH_FINAL_TXT, '*.txt')): # txt path

 if get_xml in filename:
 with open(filename, "r") as txtfile:

 check_filename = filename[-9:-4] # for wp8 - wp12

 print("Sitzung:", check_filename+".txt")

 txt = txtfile.read()

 # Three clean-up substitutions on the raw text.
 # NOTE(review): the patterns appear to have lost their backslashes
 # (e.g. `[sS]`, `d+`, `s*`) — verify against the original script.
 text = re.sub(r"^[sS]*?(?:erÃƒÂ¶ffnet)", "", txt) 

 text = re.sub(r"((cid|(cid):d+)|(Deutschers+Bundestags*D*s*d*W*w*s*D*sd*.s*w*W*w*W*w*W*w*W*d+W*w*W*d(.s+[A-Z_ÃƒÂ¤ÃƒÂ„ÃƒÂ¶ÃƒÂ–ÃƒÂ¼ÃƒÂœÃƒÂŸ][A-Za-z_ÃƒÂ¤ÃƒÂ„ÃƒÂ¶ÃƒÂ–ÃƒÂ¼ÃƒÂœÃƒÂŸ]*s*d*|d+)))", " ", text)

 text = re.sub(r"((A)|(B)|(C)|(D))", "", text)

 # Take the page references found in the FUNDSTELLE text and widen the
 # range by one page on each side.
 # NOTE(review): page[0] raises IndexError when nothing matches, and more
 # than two matches leave page_begin/page_end at 0.
 xml_page_pattern = r"(d+[A-D])"

 page = re.findall(xml_page_pattern, child_fundstelle.text)

 page_begin = 0
 page_end = 0

 if len(page) < 2:
 page_begin = int(page[0][:-1])-1; page_end = int(page[0][:-1])+1

 elif len(page) == 2:
 page_begin = int(page[0][:-1])-1; page_end = int(page[1][:-1])+1


 # NOTE(review): the format string contains no `{}` placeholders here, so
 # page_begin/page_end are not inserted into the pattern as written.
 get_pages_pattern = re.compile(r"(?<=)(.*)(?=)".format(page_begin, page_end), flags=re.DOTALL)

 text = get_pages_pattern.findall(text)

 text = "".join(text)


 # Copy every child of each BESCHLUSS element into the vorgang record.
 for child_beschluss in child2.findall("BESCHLUSS"):
 for beschluss_child in child_beschluss:
 vorgang[beschluss_child.tag] = beschluss_child.text

 VORNAME = ""
 NACHNAME = ""
 FRAKTION = ""

 # Per PERSOENLICHER_URHEBER: collect the speaker's fields, build the split
 # regexes, split the text and attach the matching blocks to the speaker.
 for child_speaker in child2.findall('PERSOENLICHER_URHEBER'):

 speaker = 

 TITEL = ""
 WAHLKREISZUSATZ = ""
 FUNKTION = ""

 ############################################################################################################

 # NOTE(review): this loop reuses the name `child`, which is also the loop
 # variable of `for child in root` above.
 for child in child_speaker:
 speaker[child.tag] = child.text

 if child.tag == "PERSON_TITEL":
 TITEL = child.text

 elif child.tag == "VORNAME":
 VORNAME = child.text

 elif child.tag == "NACHNAME":
 NACHNAME = child.text

 elif child.tag == "WAHLKREISZUSATZ":
 WAHLKREISZUSATZ = child.text

 elif child.tag == "FUNKTION":
 FUNKTION = child.text

 elif child.tag == "FRAKTION":
 FRAKTION = child.text

 else:
 pass


 ############################################################################################################


 splitter_mdb = ""
 splitter_only_name_mdb = ""

 splitter_not_mdb = ""
 splitter_only_name_not_mdb = ""

 ############################################################################################################

 # Replace the FRAKTION value by the spelling used in the patterns below.
 if FRAKTION == "BÃƒÂœNDNIS 90/DIE GRÃƒÂœNEN": FRAKTION = "BÃƒÂœNDNISs*90/DIEs*GRÃƒÂœNEN"

 elif FRAKTION == "CDU/CSU": FRAKTION = "CDU/CSU"

 elif FRAKTION == "FDP": FRAKTION = "F.D.P."

 ############################################################################################################

 # For file names starting with 08, 09, 10 or 11 the splitter is built from
 # title/surname only (no VORNAME).
 if str(check_filename).startswith(("08", "09", "10", "11")): #WP

 #print("here")
 # only MdBs
 # NOTE(review): `"".format(FRAKTION)` yields an empty string, so FRAKTION
 # never reaches any of the patterns below as written.
 if (not TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_only_name_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_only_name_mdb = r"n+(?=" + re.escape(TITEL) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (WAHLKREISZUSATZ) and (not TITEL) and (FUNKTION == MdB):
 splitter_only_name_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"

 elif (TITEL) and (WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_only_name_mdb = r"n+(?=" + re.escape(TITEL) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"

 # only Minister etc.
 elif (not FUNKTION == MdB) and (TITEL):
 splitter_only_name_not_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (not FUNKTION == MdB) and (not TITEL):
 splitter_only_name_not_mdb = r"n+(?=" + re.escape(TITEL) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"

 # NOTE(review): same condition as the previous elif, so this branch cannot be reached.
 elif (not FUNKTION == MdB) and (not TITEL):
 splitter_only_name_not_mdb = splitter_not_mdb = r"n+(?=" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"



 else:
 ############################################################################################################
 ### Splitter Regex ###

 # only MdBs
 if (not TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_mdb = r"s+(?=" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_mdb = r"s+(?=" + re.escape(TITEL) + "s*"+ re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (WAHLKREISZUSATZ) and (not TITEL) and (FUNKTION == MdB):
 splitter_mdb = r"s+(?=" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"

 elif (TITEL) and (WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_mdb = r"s+(?=" + re.escape(TITEL) + "s*" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"

 ############################################################################################################

 # Ministers etc., not MdBs.
 elif (not FUNKTION == MdB) and (TITEL):
 splitter_not_mdb = r"s+(?=" + re.escape(TITEL) + "s+" + re.escape(VORNAME) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"

 elif (not FUNKTION == MdB) and (not TITEL):
 splitter_not_mdb = r"s+(?=" + re.escape(VORNAME) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"

 ### End of splitter regex ###

 ############################################################################################################

 # Collect the patterns per speaker kind; empty strings are filtered out further down.
 if FUNKTION == MdB:
 mdb_splitterlist.append(splitter_mdb)
 if str(check_filename).startswith(("08", "09", "10", "11")):
 mdb_splitterlist.append(splitter_only_name_mdb)
 else:
 pass

 elif FUNKTION != MdB:
 mdb_not_splitterlist.append(splitter_not_mdb)
 if str(check_filename).startswith(("08", "09", "10", "11")):
 mdb_not_splitterlist.append(splitter_only_name_not_mdb)
 else:
 pass


 ############################################################################################################
 # Combine the speaker-specific patterns with the two generic ones; str_list
 # is the same list without empty entries.
 splitterlist = mdb_not_splitterlist + mdb_splitterlist


 splitterlist.append(split_pattern_president)
 splitterlist.append(split_rest_pattern)

 str_list = list(filter(None, splitterlist))

 splitted_text = 

 # Split the text wherever any of the patterns (joined with "|") matches.
 try:

 splitted_text = re.split(pattern='|'.join(str_list), string=text)

 print("Split erfolgreich.")

 # On any error: print it, append the file names to logs-me.txt and continue
 # with the next loop iteration.
 except Exception as e:

 print(e)

 with open("logs-me.txt", "a") as logme:
 logme.write(filename+" "+file_id+" "+file+"n")

 # Redundant: the `with` statement already closes the file.
 logme.close()

 print("ÃƒÂ¼berspringe:", file_id)

 continue


 speeches = 

 ############################################################################################################

 # For every block, run each splitter pattern once more: its first three
 # characters are dropped and "=" is replaced by ":" before re.findall.
 # NOTE(review): this iterates splitterlist (which can still contain empty
 # strings), not the filtered str_list.
 for speechblock in splitted_text:

 for speecher in splitterlist:

 speecher = str(speecher[3:]).replace("=", ":") # regex

 founded_speecher = re.findall(speecher, speechblock)

 # Non-MdB: the block counts for the speaker when both VORNAME and NACHNAME
 # occur in the joined findall result.
 if (FUNKTION != MdB):
 if (speaker['VORNAME'] in "".join(founded_speecher) and speaker['NACHNAME'] in "".join(founded_speecher)):

 speeches.append(speechblock)

 zurufe = re.findall(zurufe_pattern, speechblock, flags=re.DOTALL | re.MULTILINE)

 speech = "TEXT":speeches, "ZURUFE":zurufe

 speaker['INHALT'] = speech

 break #todo -> workaround

 # MdB: VORNAME+NACHNAME or NACHNAME+FRAKTION must occur in the joined result.
 elif (FUNKTION == MdB):
 if (speaker['VORNAME'] in "".join(founded_speecher) and speaker['NACHNAME'] in "".join(founded_speecher)
 or speaker['NACHNAME'] in "".join(founded_speecher) and speaker['FRAKTION'] in "".join(founded_speecher)):

 speeches.append(speechblock)

 zurufe = re.findall(zurufe_pattern, speechblock, flags=re.DOTALL | re.MULTILINE)

 speech = "TEXT":speeches, "ZURUFE":zurufe

 speaker['INHALT'] = speech

 break #todo -> workaround

 # Record the speaker under vorgang["REDNER"], then the vorgang under gesetz["VORGANG"].
 speakeritems.append(speaker)

 if "REDNER" in vorgang:
 vorgang["REDNER"].append(speaker)
 else:
 vorgang["REDNER"] = [speaker]


 vorgangliste.append(vorgang)
 gesetz["VORGANG"] = vorgangliste

 # Keep the law only if it has an INITIATIVE that is not in br_list.
 if "INITIATIVE" in gesetz.keys():
 if gesetz['INITIATIVE'] in br_list:
 pass
 elif gesetz['INITIATIVE'] not in br_list:
 gesetzliste.append(gesetz)
 else:
 print("hier stimmt etwas nicht.")


 # Hand the accumulated list to mpu.io.write.
 # NOTE(review): if this call sits inside the per-file loop in the original
 # script, the whole growing list is rewritten for every XML file — check the nesting.
 '''Write JSON FILE'''
 mpu.io.write("xssy33s.json", gesetzliste)

 print(len(gesetzliste))

# Run the whole pipeline when the file is executed.
two()

edited Jul 12 at 15:05

asked Jul 11 at 11:53

madik_atma

164

Can you add the output of the following command? python -m cProfile -s tottime <filename>.py 100 1
– Mast
Jul 11 at 12:36

cProfile looks more complicated than it is. Read the manual and a couple of example uses, mess around a bit on smaller pieces of code and you'll figure it out in no-time at all.
– Mast
Jul 11 at 12:38

1

You should run the profiler with more than one file (but fewer than 1000). Obviously something takes longer with each additional file (probably an `in` test on some growing list, or something like that), which is not visible when running with only one file.
– Graipher
Jul 11 at 12:54

1

understood. @Graipher I will try it now. Thank you for your help.
– madik_atma
Jul 11 at 12:58

2

When you've profiled your code, please add the output to the question itself, not in the comments.
– Mast
Jul 11 at 13:01

|
show 5 more comments

up vote
3
down vote

favorite

I am very new to Python; I did not learn it from books but simply by trying things out. The code runs at normal speed for the first 100 or so iterations, but from about 120 iterations onward it becomes very slow.

Sample Files: Files

cProfile Output: cProfile Output

import xml.etree.ElementTree as ET
import glob, os, re
import mpu.io


# Directory with the XML files; two() globs "*.xml" relative to it after the chdir below.
PATH_FINAL_XML = "/Users/x/PycharmProjects/BA/booking-system/final step/xml files"

# Directory with the .txt files that two() searches for the matching text file.
PATH_FINAL_TXT = "/Users/x/PycharmProjects/BA/booking-system/final step/txt files"

# Runs at import time; os.chdir raises if the directory does not exist.
os.chdir(PATH_FINAL_XML)

#path, dirs, files = os.walk(PATH).__next__()
#print("Files", len(files))

# Accumulator filled by two(): one dict per law that passes the initiator filter.
gesetzliste = []

# Not referenced anywhere in the visible part of the file; kept because other
# code may still rely on the names.
gesetzanzahlliste = []

zeitliste = []

# Initiators whose bills two() skips (compared against the INITIATIVE value).
# NOTE(review): neither "Brandenburg" nor "Hessen" is listed — confirm whether
# those two were left out on purpose.
br_list = [
    "Baden-Württemberg", "Bayern", "Berlin", "Bremen", "Hamburg",
    "Mecklenburg-Vorpommern",
    "Nordrhein-Westfalen", "Niedersachsen", "Rheinland-Pfalz", "Saarland",
    "Sachsen", "Sachsen-Anhalt", "Schleswig-Holstein", "Thüringen", "Bundesregierung",
]

# FUNKTION value two() compares against to tell MdBs from other speakers.
MdB = "MdB"

# All patterns below are raw strings with their regex escapes (\d, \s, \n, \b)
# in place. two() strips the first three characters of each splitter pattern
# (speecher[3:]), which only works when they start with the 3-character "\n+".

# Digits between the last "/" and ".pdf" in a link (PDFMiner file naming).
pdf_xml_pattern = re.compile(r"(?<=/)\d+(?=\.pdf)")

# Split before a line that starts with the (vice-)president of the Bundestag
# followed by optional "Dr." / name words and a colon.
split_pattern_president = r"\n+(?=(?:Präsident|Präsidentin|Vizepräsident|Vizepräsidentin)\s*(?:(?:Dr\.)*\s*[A-ZÖÜÄß]+[a-zöäüß]*\s*)*:)"

# Split before any other speaker line of the form "[Dr.] Name Name (PARTY)":
# MdBs, ministers, guest speakers, etc.
# NOTE(review): the parentheses are read as literal "(" / ")" and the hyphen in
# the class as a literal "-" (an unescaped "X-," is an invalid range) — confirm
# against the original script.
split_rest_pattern = r"\n+(?=(?:Dr\.)*\s*[A-ZÄÖÜ]*[a-züäö]*\s*[A-ZÄÖÜ]*[a-züäö]*\s*\([A-ZÜÄÖX\-,./\s]*\))"

# Identify interjections (Zurufe): a parenthesised run of at least three
# words whose words do not start with Dokument / Drucksache / Tagesordnung /
# the Greens' party name.
# NOTE(review): the outer parentheses are read as literal and the hyphens in
# both classes as literal "-"; the inner party-name group is left capturing,
# so re.findall returns 2-tuples — confirm against the original script.
zurufe_pattern = r"(\((?:(?!\bDokument\b|\bDrucksache\b|\bTagesordnung\b|\b(BÜNDNIS 90/DIE GRÜNEN)\b)[A-Za-z_äÄöÖüÜß\[\]/,._–!?:;'\-—0-9]*\s){2,}[A-Za-z_äÄöÖüÜß;!.?\[\]/,':\-—–0-9]+[.,?!]*?\))"


def two():

 for file in glob.glob("*.xml"): # xml path

 gesetz = dict()

 '''Childnode Lists'''

 schlagworterliste = list()

 sachgebietliste = list()

 drsliste = list()

 plenumliste = list()

 vorgangliste = list()

 speakeritems = list()

 mdb_splitterlist = list()

 mdb_not_splitterlist = list()

 splitterlist = list()

 '''Childnode Lists'''

 file_id = file.replace(".xml", "")
 print("Gesetzentwurf: ", file)

 tree = ET.parse(file)
 root = tree.getroot()

 for child in root:

 gesetz["File_ID"] = file_id

 if str(child.tag) == "SCHLAGWORT":
 schlagworterliste.append(child.text)
 gesetz[child.tag] = schlagworterliste

 elif str(child.tag) == "SACHGEBIET":
 sachgebietliste.append(child.text)
 gesetz[child.tag] = sachgebietliste

 elif str(child.tag) == "WICHTIGE_DRUCKSACHE":
 for child_wichtige_drucksache_herausgeber in child.findall("DRS_HERAUSGEBER"):
 for child_wichtige_drucksache_nummer in child.findall("DRS_NUMMER"):
 for child_wichtige_drucksache_typ in child.findall("DRS_TYP"):
 drs = child_wichtige_drucksache_herausgeber.tag:child_wichtige_drucksache_herausgeber.text,
 child_wichtige_drucksache_nummer.tag:child_wichtige_drucksache_nummer.text,
 child_wichtige_drucksache_typ.tag:child_wichtige_drucksache_typ.text

 drsliste.append(drs)
 gesetz[child.tag] = drsliste

 elif str(child.tag) == "PLENUM":
 for child_plenum_klartext in child.findall("PLPR_KLARTEXT"):
 for child_plenum_herausgeber in child.findall("PLPR_HERAUSGEBER"):
 for child_plenum_nummer in child.findall("PLPR_NUMMER"):
 for child_plenum_seiten in child.findall("PLPR_SEITEN"):
 for child_plenum_link in child.findall("PLPR_LINK"):
 plenum = child_plenum_klartext.tag:child_plenum_klartext.text,
 child_plenum_herausgeber.tag:child_plenum_herausgeber.text,
 child_plenum_nummer.tag:child_plenum_nummer.text,
 child_plenum_seiten.tag:child_plenum_seiten.text,
 child_plenum_link.tag:child_plenum_link.text

 plenumliste.append(plenum)
 gesetz["PLPR"] = plenumliste


 else:
 gesetz[child.tag] = child.text

 if "INITIATIVE" in gesetz.keys():
 if gesetz['INITIATIVE'] in br_list:
 print("CONTINUE")
 continue
 else:
 print("DEVAM")


 for child2 in child:
 for child_zuordnung in child2.findall("ZUORDNUNG"):
 for child_urheber in child2.findall("URHEBER"):

 if "Beratung" in child_urheber.text:
 print("Beratung: ", child_urheber.text)

 for child_fundstelle in child2.findall("FUNDSTELLE"):
 vorgang = child_zuordnung.tag:child_zuordnung.text,
 child_urheber.tag:child_urheber.text,
 child_fundstelle.tag:child_fundstelle.text

 for child_fundstelle_info in child2.findall("FUNDSTELLE_LINK"):
 get_xml = pdf_xml_pattern.findall(str(child_fundstelle_info.text))
 get_xml = "".join(get_xml)+".txt"

 for filename in glob.glob(os.path.join(PATH_FINAL_TXT, '*.txt')): # txt path

 if get_xml in filename:
 with open(filename, "r") as txtfile:

 check_filename = filename[-9:-4] # for wp8 - wp12

 print("Sitzung:", check_filename+".txt")

 txt = txtfile.read()

 text = re.sub(r"^[sS]*?(?:erÃƒÂ¶ffnet)", "", txt) 

 text = re.sub(r"((cid|(cid):d+)|(Deutschers+Bundestags*D*s*d*W*w*s*D*sd*.s*w*W*w*W*w*W*w*W*d+W*w*W*d(.s+[A-Z_ÃƒÂ¤ÃƒÂ„ÃƒÂ¶ÃƒÂ–ÃƒÂ¼ÃƒÂœÃƒÂŸ][A-Za-z_ÃƒÂ¤ÃƒÂ„ÃƒÂ¶ÃƒÂ–ÃƒÂ¼ÃƒÂœÃƒÂŸ]*s*d*|d+)))", " ", text)

 text = re.sub(r"((A)|(B)|(C)|(D))", "", text)

 xml_page_pattern = r"(d+[A-D])"

 page = re.findall(xml_page_pattern, child_fundstelle.text)

 page_begin = 0
 page_end = 0

 if len(page) < 2:
 page_begin = int(page[0][:-1])-1; page_end = int(page[0][:-1])+1

 elif len(page) == 2:
 page_begin = int(page[0][:-1])-1; page_end = int(page[1][:-1])+1


 get_pages_pattern = re.compile(r"(?<=)(.*)(?=)".format(page_begin, page_end), flags=re.DOTALL)

 text = get_pages_pattern.findall(text)

 text = "".join(text)


 for child_beschluss in child2.findall("BESCHLUSS"):
 for beschluss_child in child_beschluss:
 vorgang[beschluss_child.tag] = beschluss_child.text

 VORNAME = ""
 NACHNAME = ""
 FRAKTION = ""

 for child_speaker in child2.findall('PERSOENLICHER_URHEBER'):

 speaker = 

 TITEL = ""
 WAHLKREISZUSATZ = ""
 FUNKTION = ""

 ############################################################################################################

 for child in child_speaker:
 speaker[child.tag] = child.text

 if child.tag == "PERSON_TITEL":
 TITEL = child.text

 elif child.tag == "VORNAME":
 VORNAME = child.text

 elif child.tag == "NACHNAME":
 NACHNAME = child.text

 elif child.tag == "WAHLKREISZUSATZ":
 WAHLKREISZUSATZ = child.text

 elif child.tag == "FUNKTION":
 FUNKTION = child.text

 elif child.tag == "FRAKTION":
 FRAKTION = child.text

 else:
 pass


 ############################################################################################################


 splitter_mdb = ""
 splitter_only_name_mdb = ""

 splitter_not_mdb = ""
 splitter_only_name_not_mdb = ""

 ############################################################################################################

 if FRAKTION == "BÃƒÂœNDNIS 90/DIE GRÃƒÂœNEN": FRAKTION = "BÃƒÂœNDNISs*90/DIEs*GRÃƒÂœNEN"

 elif FRAKTION == "CDU/CSU": FRAKTION = "CDU/CSU"

 elif FRAKTION == "FDP": FRAKTION = "F.D.P."

 ############################################################################################################

 if str(check_filename).startswith(("08", "09", "10", "11")): #WP

 #print("here")
 # only MdBs
 if (not TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_only_name_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_only_name_mdb = r"n+(?=" + re.escape(TITEL) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (WAHLKREISZUSATZ) and (not TITEL) and (FUNKTION == MdB):
 splitter_only_name_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"

 elif (TITEL) and (WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_only_name_mdb = r"n+(?=" + re.escape(TITEL) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"

 # only Minister etc.
 elif (not FUNKTION == MdB) and (TITEL):
 splitter_only_name_not_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (not FUNKTION == MdB) and (not TITEL):
 splitter_only_name_not_mdb = r"n+(?=" + re.escape(TITEL) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"

 elif (not FUNKTION == MdB) and (not TITEL):
 splitter_only_name_not_mdb = splitter_not_mdb = r"n+(?=" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"



 else:
 ############################################################################################################
 ### Splitter Regex ###

 # only MdBs
 if (not TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_mdb = r"s+(?=" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_mdb = r"s+(?=" + re.escape(TITEL) + "s*"+ re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (WAHLKREISZUSATZ) and (not TITEL) and (FUNKTION == MdB):
 splitter_mdb = r"s+(?=" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"

 elif (TITEL) and (WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_mdb = r"s+(?=" + re.escape(TITEL) + "s*" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"

 ############################################################################################################

 # Minister etc. keine MdBs.
 elif (not FUNKTION == MdB) and (TITEL):
 splitter_not_mdb = r"s+(?=" + re.escape(TITEL) + "s+" + re.escape(VORNAME) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"

 elif (not FUNKTION == MdB) and (not TITEL):
 splitter_not_mdb = r"s+(?=" + re.escape(VORNAME) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"

 ### Splitter Regex Ende ###

 ############################################################################################################

 if FUNKTION == MdB:
 mdb_splitterlist.append(splitter_mdb)
 if str(check_filename).startswith(("08", "09", "10", "11")):
 mdb_splitterlist.append(splitter_only_name_mdb)
 else:
 pass

 elif FUNKTION != MdB:
 mdb_not_splitterlist.append(splitter_not_mdb)
 if str(check_filename).startswith(("08", "09", "10", "11")):
 mdb_not_splitterlist.append(splitter_only_name_not_mdb)
 else:
 pass


 ############################################################################################################
 splitterlist = mdb_not_splitterlist + mdb_splitterlist


 splitterlist.append(split_pattern_president)
 splitterlist.append(split_rest_pattern)

 str_list = list(filter(None, splitterlist))

 splitted_text = 

 try:

 splitted_text = re.split(pattern='|'.join(str_list), string=text)

 print("Split erfolgreich.")

 except Exception as e:

 print(e)

 with open("logs-me.txt", "a") as logme:
 logme.write(filename+" "+file_id+" "+file+"n")

 logme.close()

 print("ÃƒÂ¼berspringe:", file_id)

 continue


 speeches = 

 ############################################################################################################

 for speechblock in splitted_text:

 for speecher in splitterlist:

 speecher = str(speecher[3:]).replace("=", ":") # regex

 founded_speecher = re.findall(speecher, speechblock)

 if (FUNKTION != MdB):
 if (speaker['VORNAME'] in "".join(founded_speecher) and speaker['NACHNAME'] in "".join(founded_speecher)):

 speeches.append(speechblock)

 zurufe = re.findall(zurufe_pattern, speechblock, flags=re.DOTALL | re.MULTILINE)

 speech = "TEXT":speeches, "ZURUFE":zurufe

 speaker['INHALT'] = speech

 break #todo -> workaround

 elif (FUNKTION == MdB):
 if (speaker['VORNAME'] in "".join(founded_speecher) and speaker['NACHNAME'] in "".join(founded_speecher)
 or speaker['NACHNAME'] in "".join(founded_speecher) and speaker['FRAKTION'] in "".join(founded_speecher)):

 speeches.append(speechblock)

 zurufe = re.findall(zurufe_pattern, speechblock, flags=re.DOTALL | re.MULTILINE)

 speech = "TEXT":speeches, "ZURUFE":zurufe

 speaker['INHALT'] = speech

 break #todo -> workaround

 speakeritems.append(speaker)

 if "REDNER" in vorgang:
 vorgang["REDNER"].append(speaker)
 else:
 vorgang["REDNER"] = [speaker]


 vorgangliste.append(vorgang)
 gesetz["VORGANG"] = vorgangliste

 if "INITIATIVE" in gesetz.keys():
 if gesetz['INITIATIVE'] in br_list:
 pass
 elif gesetz['INITIATIVE'] not in br_list:
 gesetzliste.append(gesetz)
 else:
 print("hier stimmt etwas nicht.")


 '''Write JSON FILE'''
 mpu.io.write("xssy33s.json", gesetzliste)

 print(len(gesetzliste))

two()

edited Jul 12 at 15:05

asked Jul 11 at 11:53

madik_atma

164

Since I am very new to Python, I did not learn it from books but by trial and error. The code runs at normal speed for roughly the first 100 iterations, but from about iteration 120 onward it becomes very slow.

Sample Files: Files

cProfile Output: cProfile Output

import xml.etree.ElementTree as ET
import glob, os, re
import mpu.io


PATH_FINAL_XML = "/Users/x/PycharmProjects/BA/booking-system/final step/xml files"

PATH_FINAL_TXT = "/Users/x/PycharmProjects/BA/booking-system/final step/txt files"

os.chdir(PATH_FINAL_XML)

#path, dirs, files = os.walk(PATH).__next__()
#print("Files", len(files))

gesetzliste = list()

gesetzanzahlliste = list()

zeitliste = list()

br_list = ['Baden-WÃƒÂ¼rttemberg', "Bayern", "Berlin", "Bremen", "Hamburg",
 "Hamburg", "Mecklenburg-Vorpommern",
 "Nordrhein-Westfalen", "Niedersachsen", "Rheinland-Pfalz", "Saarland",
 "Sachsen", "Sachsen-Anhalt", "Schleswig-Holstein", "ThÃƒÂ¼ringen", "Bundesregierung"]

MdB = "MdB"

pdf_xml_pattern = re.compile("(?<=/)d+(?=.pdf)") # PDF Miner Pattern

split_pattern_president = r"n+(?=(?:PrÃƒÂ¤sident|PrÃƒÂ¤sidentin|VizeprÃƒÂ¤sident|VizeprÃƒÂ¤sidentin)s*(?:(?:Dr.)*s*[A-ZÃƒÂ–ÃƒÂœÃƒÂ„ÃƒÂŸ]+[a-zÃƒÂ¶ÃƒÂ¤ÃƒÂ¼ÃƒÂŸ]*s*)*:)" # PrÃƒÂ¤sident und VizeprÃƒÂ¤sident im BT

split_rest_pattern = r"n+(?=(?:Dr.)*s*[A-ZÃƒÂ„ÃƒÂ–ÃƒÂœ]*[a-zÃƒÂ¼ÃƒÂ¤ÃƒÂ¶]*s*[A-ZÃƒÂ„ÃƒÂ–ÃƒÂœ]*[a-zÃƒÂ¼ÃƒÂ¤ÃƒÂ¶]*s*([A-ZÃƒÂœÃƒÂ„ÃƒÂ–X-,./s]*))" # alle auÃƒÂŸer, MdB, Minister, Gastredner etc.

zurufe_pattern = r"(((?:(?!bDokumentb|bDrucksacheb|bTagesordnungb|b(BÃƒÂœNDNIS 90/DIE GRÃƒÂœNEN)b)[A-Za-z_ÃƒÂ¤ÃƒÂ„ÃƒÂ¶ÃƒÂ–ÃƒÂ¼ÃƒÂœÃƒÂŸ[]/,._Ã¢Â€Â“!?:;'-Ã¢Â€Â”0-9]*s)2,[A-Za-z_ÃƒÂ¤ÃƒÂ„ÃƒÂ¶ÃƒÂ–ÃƒÂ¼ÃƒÂœÃƒÂŸ;!.?[]/,':-Ã¢Â€Â”Ã¢Â€Â“0-9]+[.,?!]*?))" # um zurufe zu identifizieren


def two():

 for file in glob.glob("*.xml"): # xml path

 gesetz = dict()

 '''Childnode Lists'''

 schlagworterliste = list()

 sachgebietliste = list()

 drsliste = list()

 plenumliste = list()

 vorgangliste = list()

 speakeritems = list()

 mdb_splitterlist = list()

 mdb_not_splitterlist = list()

 splitterlist = list()

 '''Childnode Lists'''

 file_id = file.replace(".xml", "")
 print("Gesetzentwurf: ", file)

 tree = ET.parse(file)
 root = tree.getroot()

 for child in root:

 gesetz["File_ID"] = file_id

 if str(child.tag) == "SCHLAGWORT":
 schlagworterliste.append(child.text)
 gesetz[child.tag] = schlagworterliste

 elif str(child.tag) == "SACHGEBIET":
 sachgebietliste.append(child.text)
 gesetz[child.tag] = sachgebietliste

 elif str(child.tag) == "WICHTIGE_DRUCKSACHE":
 for child_wichtige_drucksache_herausgeber in child.findall("DRS_HERAUSGEBER"):
 for child_wichtige_drucksache_nummer in child.findall("DRS_NUMMER"):
 for child_wichtige_drucksache_typ in child.findall("DRS_TYP"):
 drs = child_wichtige_drucksache_herausgeber.tag:child_wichtige_drucksache_herausgeber.text,
 child_wichtige_drucksache_nummer.tag:child_wichtige_drucksache_nummer.text,
 child_wichtige_drucksache_typ.tag:child_wichtige_drucksache_typ.text

 drsliste.append(drs)
 gesetz[child.tag] = drsliste

 elif str(child.tag) == "PLENUM":
 for child_plenum_klartext in child.findall("PLPR_KLARTEXT"):
 for child_plenum_herausgeber in child.findall("PLPR_HERAUSGEBER"):
 for child_plenum_nummer in child.findall("PLPR_NUMMER"):
 for child_plenum_seiten in child.findall("PLPR_SEITEN"):
 for child_plenum_link in child.findall("PLPR_LINK"):
 plenum = child_plenum_klartext.tag:child_plenum_klartext.text,
 child_plenum_herausgeber.tag:child_plenum_herausgeber.text,
 child_plenum_nummer.tag:child_plenum_nummer.text,
 child_plenum_seiten.tag:child_plenum_seiten.text,
 child_plenum_link.tag:child_plenum_link.text

 plenumliste.append(plenum)
 gesetz["PLPR"] = plenumliste


 else:
 gesetz[child.tag] = child.text

 if "INITIATIVE" in gesetz.keys():
 if gesetz['INITIATIVE'] in br_list:
 print("CONTINUE")
 continue
 else:
 print("DEVAM")


 for child2 in child:
 for child_zuordnung in child2.findall("ZUORDNUNG"):
 for child_urheber in child2.findall("URHEBER"):

 if "Beratung" in child_urheber.text:
 print("Beratung: ", child_urheber.text)

 for child_fundstelle in child2.findall("FUNDSTELLE"):
 vorgang = child_zuordnung.tag:child_zuordnung.text,
 child_urheber.tag:child_urheber.text,
 child_fundstelle.tag:child_fundstelle.text

 for child_fundstelle_info in child2.findall("FUNDSTELLE_LINK"):
 get_xml = pdf_xml_pattern.findall(str(child_fundstelle_info.text))
 get_xml = "".join(get_xml)+".txt"

 for filename in glob.glob(os.path.join(PATH_FINAL_TXT, '*.txt')): # txt path

 if get_xml in filename:
 with open(filename, "r") as txtfile:

 check_filename = filename[-9:-4] # for wp8 - wp12

 print("Sitzung:", check_filename+".txt")

 txt = txtfile.read()

 text = re.sub(r"^[sS]*?(?:erÃƒÂ¶ffnet)", "", txt) 

 text = re.sub(r"((cid|(cid):d+)|(Deutschers+Bundestags*D*s*d*W*w*s*D*sd*.s*w*W*w*W*w*W*w*W*d+W*w*W*d(.s+[A-Z_ÃƒÂ¤ÃƒÂ„ÃƒÂ¶ÃƒÂ–ÃƒÂ¼ÃƒÂœÃƒÂŸ][A-Za-z_ÃƒÂ¤ÃƒÂ„ÃƒÂ¶ÃƒÂ–ÃƒÂ¼ÃƒÂœÃƒÂŸ]*s*d*|d+)))", " ", text)

 text = re.sub(r"((A)|(B)|(C)|(D))", "", text)

 xml_page_pattern = r"(d+[A-D])"

 page = re.findall(xml_page_pattern, child_fundstelle.text)

 page_begin = 0
 page_end = 0

 if len(page) < 2:
 page_begin = int(page[0][:-1])-1; page_end = int(page[0][:-1])+1

 elif len(page) == 2:
 page_begin = int(page[0][:-1])-1; page_end = int(page[1][:-1])+1


 get_pages_pattern = re.compile(r"(?<=)(.*)(?=)".format(page_begin, page_end), flags=re.DOTALL)

 text = get_pages_pattern.findall(text)

 text = "".join(text)


 for child_beschluss in child2.findall("BESCHLUSS"):
 for beschluss_child in child_beschluss:
 vorgang[beschluss_child.tag] = beschluss_child.text

 VORNAME = ""
 NACHNAME = ""
 FRAKTION = ""

 for child_speaker in child2.findall('PERSOENLICHER_URHEBER'):

 speaker = 

 TITEL = ""
 WAHLKREISZUSATZ = ""
 FUNKTION = ""

 ############################################################################################################

 for child in child_speaker:
 speaker[child.tag] = child.text

 if child.tag == "PERSON_TITEL":
 TITEL = child.text

 elif child.tag == "VORNAME":
 VORNAME = child.text

 elif child.tag == "NACHNAME":
 NACHNAME = child.text

 elif child.tag == "WAHLKREISZUSATZ":
 WAHLKREISZUSATZ = child.text

 elif child.tag == "FUNKTION":
 FUNKTION = child.text

 elif child.tag == "FRAKTION":
 FRAKTION = child.text

 else:
 pass


 ############################################################################################################


 splitter_mdb = ""
 splitter_only_name_mdb = ""

 splitter_not_mdb = ""
 splitter_only_name_not_mdb = ""

 ############################################################################################################

 if FRAKTION == "BÃƒÂœNDNIS 90/DIE GRÃƒÂœNEN": FRAKTION = "BÃƒÂœNDNISs*90/DIEs*GRÃƒÂœNEN"

 elif FRAKTION == "CDU/CSU": FRAKTION = "CDU/CSU"

 elif FRAKTION == "FDP": FRAKTION = "F.D.P."

 ############################################################################################################

 if str(check_filename).startswith(("08", "09", "10", "11")): #WP

 #print("here")
 # only MdBs
 if (not TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_only_name_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_only_name_mdb = r"n+(?=" + re.escape(TITEL) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (WAHLKREISZUSATZ) and (not TITEL) and (FUNKTION == MdB):
 splitter_only_name_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"

 elif (TITEL) and (WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_only_name_mdb = r"n+(?=" + re.escape(TITEL) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"

 # only Minister etc.
 elif (not FUNKTION == MdB) and (TITEL):
 splitter_only_name_not_mdb = r"n+(?=" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (not FUNKTION == MdB) and (not TITEL):
 splitter_only_name_not_mdb = r"n+(?=" + re.escape(TITEL) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"

 elif (not FUNKTION == MdB) and (not TITEL):
 splitter_only_name_not_mdb = splitter_not_mdb = r"n+(?=" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"



 else:
 ############################################################################################################
 ### Splitter Regex ###

 # only MdBs
 if (not TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_mdb = r"s+(?=" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (TITEL) and (not WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_mdb = r"s+(?=" + re.escape(TITEL) + "s*"+ re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + "".format(FRAKTION) +"))"

 elif (WAHLKREISZUSATZ) and (not TITEL) and (FUNKTION == MdB):
 splitter_mdb = r"s+(?=" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"

 elif (TITEL) and (WAHLKREISZUSATZ) and (FUNKTION == MdB):
 splitter_mdb = r"s+(?=" + re.escape(TITEL) + "s*" + re.escape(VORNAME) + "s*" + re.escape(NACHNAME) + "s*(" + re.escape(WAHLKREISZUSATZ) + ")s*(" + "".format(FRAKTION) +"))"

 ############################################################################################################

 # Minister etc. keine MdBs.
 elif (not FUNKTION == MdB) and (TITEL):
 splitter_not_mdb = r"s+(?=" + re.escape(TITEL) + "s+" + re.escape(VORNAME) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"

 elif (not FUNKTION == MdB) and (not TITEL):
 splitter_not_mdb = r"s+(?=" + re.escape(VORNAME) + "s+" + re.escape(NACHNAME) + r",0,1s*" + "".format(FUNKTION) + r"*b(?:(?!nn)[^:])*:)"

 ### Splitter Regex Ende ###

 ############################################################################################################

 if FUNKTION == MdB:
 mdb_splitterlist.append(splitter_mdb)
 if str(check_filename).startswith(("08", "09", "10", "11")):
 mdb_splitterlist.append(splitter_only_name_mdb)
 else:
 pass

 elif FUNKTION != MdB:
 mdb_not_splitterlist.append(splitter_not_mdb)
 if str(check_filename).startswith(("08", "09", "10", "11")):
 mdb_not_splitterlist.append(splitter_only_name_not_mdb)
 else:
 pass


 ############################################################################################################
 splitterlist = mdb_not_splitterlist + mdb_splitterlist


 splitterlist.append(split_pattern_president)
 splitterlist.append(split_rest_pattern)

 str_list = list(filter(None, splitterlist))

 splitted_text = 

 try:

 splitted_text = re.split(pattern='|'.join(str_list), string=text)

 print("Split erfolgreich.")

 except Exception as e:

 print(e)

 with open("logs-me.txt", "a") as logme:
 logme.write(filename+" "+file_id+" "+file+"n")

 logme.close()

 print("ÃƒÂ¼berspringe:", file_id)

 continue


 speeches = 

 ############################################################################################################

 for speechblock in splitted_text:

 for speecher in splitterlist:

 speecher = str(speecher[3:]).replace("=", ":") # regex

 founded_speecher = re.findall(speecher, speechblock)

 if (FUNKTION != MdB):
 if (speaker['VORNAME'] in "".join(founded_speecher) and speaker['NACHNAME'] in "".join(founded_speecher)):

 speeches.append(speechblock)

 zurufe = re.findall(zurufe_pattern, speechblock, flags=re.DOTALL | re.MULTILINE)

 speech = "TEXT":speeches, "ZURUFE":zurufe

 speaker['INHALT'] = speech

 break #todo -> workaround

 elif (FUNKTION == MdB):
 if (speaker['VORNAME'] in "".join(founded_speecher) and speaker['NACHNAME'] in "".join(founded_speecher)
 or speaker['NACHNAME'] in "".join(founded_speecher) and speaker['FRAKTION'] in "".join(founded_speecher)):

 speeches.append(speechblock)

 zurufe = re.findall(zurufe_pattern, speechblock, flags=re.DOTALL | re.MULTILINE)

 speech = "TEXT":speeches, "ZURUFE":zurufe

 speaker['INHALT'] = speech

 break #todo -> workaround

 speakeritems.append(speaker)

 if "REDNER" in vorgang:
 vorgang["REDNER"].append(speaker)
 else:
 vorgang["REDNER"] = [speaker]


 vorgangliste.append(vorgang)
 gesetz["VORGANG"] = vorgangliste

 if "INITIATIVE" in gesetz.keys():
 if gesetz['INITIATIVE'] in br_list:
 pass
 elif gesetz['INITIATIVE'] not in br_list:
 gesetzliste.append(gesetz)
 else:
 print("hier stimmt etwas nicht.")


 '''Write JSON FILE'''
 mpu.io.write("xssy33s.json", gesetzliste)

 print(len(gesetzliste))

two()

edited Jul 12 at 15:05

asked Jul 11 at 11:53

madik_atma

164

edited Jul 12 at 15:05

asked Jul 11 at 11:53

madik_atma

164

asked Jul 11 at 11:53

madik_atma

164

asked Jul 11 at 11:53

madik_atma

164

Can you add the output of the following command? python -m cProfile -s tottime <filename>.py 100 1
â€“Â Mast
Jul 11 at 12:36

cProfile looks more complicated than it is. Read the manual and a couple of example uses, experiment a bit on smaller pieces of code, and you'll figure it out in no time at all.
– Mast
Jul 11 at 12:38

1

You should run the profiler with more than one file (but fewer than 1000). Obviously there is something that takes longer for each additional file (probably an `in` membership test on some growing list, or something like that), which is not visible when running with only one file.
– Graipher
Jul 11 at 12:54

1

Understood. @Graipher, I will try it now. Thank you for your help.
– madik_atma
Jul 11 at 12:58

2

When you've profiled your code, please add the output to the question itself, not in the comments.
â€“Â Mast
Jul 11 at 13:01

Â |Â
show 5 more comments

Can you add the output of the following command? python -m cProfile -s tottime <filename>.py 100 1
â€“Â Mast
Jul 11 at 12:36

cProfile looks more complicated than it is. Read the manual and a couple of example uses, experiment a bit on smaller pieces of code, and you'll figure it out in no time at all.
– Mast
Jul 11 at 12:38

1

You should run the profiler with more than one file (but fewer than 1000). Obviously there is something that takes longer for each additional file (probably an `in` membership test on some growing list, or something like that), which is not visible when running with only one file.
– Graipher
Jul 11 at 12:54

1

Understood. @Graipher, I will try it now. Thank you for your help.
– madik_atma
Jul 11 at 12:58

2

When you've profiled your code, please add the output to the question itself, not in the comments.
â€“Â Mast
Jul 11 at 13:01

Can you add the output of the following command? python -m cProfile -s tottime <filename>.py 100 1
â€“Â Mast
Jul 11 at 12:36

cProfile looks more complicated than it is. Read the manual and a couple of example uses, experiment a bit on smaller pieces of code, and you'll figure it out in no time at all.
– Mast
Jul 11 at 12:38

You should run the profiler with more than one file (but fewer than 1000). Obviously there is something that takes longer for each additional file (probably an `in` membership test on some growing list, or something like that), which is not visible when running with only one file.
– Graipher
Jul 11 at 12:54

Understood. @Graipher, I will try it now. Thank you for your help.
– madik_atma
Jul 11 at 12:58

When you've profiled your code, please add the output to the question itself, not in the comments.
â€“Â Mast
Jul 11 at 13:01

Â |Â
show 5 more comments

1 Answer
1

active

oldest

votes

up vote
3
down vote

Runtime

This is the only obvious place I could find that looks like it might scale badly with an increasing number of files in the directory:

get_xml = pdf_xml_pattern.findall(
 str(child_fundstelle_info.text))
get_xml = "".join(get_xml) + ".txt"
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, '*.txt')):
 if get_xml in filename:
 with open(filename, "r") as txtfile:
 ...

This loops over all text files in the directory to find the ones that contain the correct name. Instead, filter as early as possible by putting the name into the glob pattern itself:

get_xml = pdf_xml_pattern.findall(
    str(child_fundstelle_info.text))
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, "*{}*.txt".format("".join(get_xml)))):
    with open(filename, "r") as txtfile:
        ...

This assumes that several files can match. If there is only one, then it becomes even simpler:

get_xml = pdf_xml_pattern.findall(
    str(child_fundstelle_info.text))
# txt path
filename = os.path.join(PATH_FINAL_TXT, "".join(get_xml) + ".txt")
with open(filename, "r") as txtfile:
    ...

Style

Currently your code is one gigantic function. This makes it very hard to read (especially because the indentation level becomes very deep). Try to pull out single actions into their own functions, taking all relevant inputs as arguments and returning its results. This also allows you to give these functions names that make it more obvious what happens.

Code Review (and indeed the whole StackExchange network, except for some obvious exceptions, like Stack Overflow en español) is in English. Because your question contains a lot of German, it is harder for most people to read, which makes it a bit harder to get good reviews.

However, what is actually worse is that you are not sticking to German names, either. You mix German variable names with English variable names (some of which contain grammatical errors, like speecher_list, which is probably supposed to be speakers_list). When programming I usually stick with English names, because it makes distributing your code easier and also it jars less that all the standard functions have English names.

answered Jul 11 at 13:14

Graipher

20.4k42981

I completely agree with all points in your post. I'll pay more attention to them in my future code. I have just posted the cProfile output of a run of about 6 hours. It looks like the problem lies with the re.split call, because on each iteration it receives more than 15 regexes combined by "|".join.
– madik_atma
Jul 12 at 15:07

add a commentÂ |Â

Your Answer

StackExchange.ifUsing("editor", function ()
return StackExchange.using("mathjaxEditing", function ()
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix)
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
);
);
, "mathjax-editing");

StackExchange.ifUsing("editor", function ()
StackExchange.using("externalEditor", function ()
StackExchange.using("snippets", function ()
StackExchange.snippets.init();
);
);
, "code-snippets");

StackExchange.ready(function()
var channelOptions =
tags: "".split(" "),
id: "196"
;
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function()
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled)
StackExchange.using("snippets", function()
createEditor();
);

else
createEditor();

);

function createEditor()
StackExchange.prepareEditor(
heartbeatType: 'answer',
convertImagesToLinks: false,
noModals: false,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
);

);

draft saved

draft discarded

StackExchange.ready(
function ()
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f198280%2fextracting-text-from-bundestag-proceedings-as-directed-by-xml-files-and-exporti%23new-answer', 'question_page');

);

Post as a guest

Name

1 Answer
1

active

oldest

votes

1 Answer
1

active

oldest

votes

up vote
3
down vote

Runtime

This is the only obvious place I could find that looks like it might scale badly with an increasing number of files in the directory:

get_xml = pdf_xml_pattern.findall(
 str(child_fundstelle_info.text))
get_xml = "".join(get_xml) + ".txt"
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, '*.txt')):
 if get_xml in filename:
 with open(filename, "r") as txtfile:
 ...

This loops over all text files in the directory to find the ones that contain the correct name. Instead, filter as soon as possible:

get_xml = pdf_xml_pattern.findall(
 str(child_fundstelle_info.text))
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, "*{}*.txt".format("".join(get_xml)))):
 with open(filename, "r") as txtfile:
 ...

This assumes that there are multiple files that can fit. If there is only one, then it becomes even simpler:

get_xml = pdf_xml_pattern.findall(
 str(child_fundstelle_info.text))
# txt path
filename = os.path.join(PATH_FINAL_TXT, "".join(get_xml) + ".txt")
with open(filename, "r") as txtfile:
 ...

Style

answered Jul 11 at 13:14

Graipher

20.4k42981

I completely agree with all points in your post. I'll be more specific about your points in my future codes. I have just posted the cProfile output of about 6 hours. It looks like the problem lies with the re.split method, because the method gets more than 15 regexes with each iteration by the "|" .join.
– madik_atma
Jul 12 at 15:07

add a comment |

up vote
3
down vote

Runtime

This is the only obvious place I could find that looks like it might scale badly with an increasing number of files in the directory:

get_xml = pdf_xml_pattern.findall(
 str(child_fundstelle_info.text))
get_xml = "".join(get_xml) + ".txt"
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, '*.txt')):
 if get_xml in filename:
 with open(filename, "r") as txtfile:
 ...

This loops over all text files in the directory to find the ones that contain the correct name. Instead, filter as soon as possible:

get_xml = pdf_xml_pattern.findall(
 str(child_fundstelle_info.text))
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, "*{}*.txt".format("".join(get_xml)))):
 with open(filename, "r") as txtfile:
 ...

This assumes that there are multiple files that can fit. If there is only one, then it becomes even simpler:

get_xml = pdf_xml_pattern.findall(
 str(child_fundstelle_info.text))
# txt path
filename = os.path.join(PATH_FINAL_TXT, "".join(get_xml) + ".txt")
with open(filename, "r") as txtfile:
 ...

Style

answered Jul 11 at 13:14

Graipher

20.4k42981

I completely agree with all points in your post. I'll be more specific about your points in my future codes. I have just posted the cProfile output of about 6 hours. It looks like the problem lies with the re.split method, because the method gets more than 15 regexes with each iteration by the "|" .join.
– madik_atma
Jul 12 at 15:07

add a comment |

up vote
3
down vote

Runtime

This is the only obvious place I could find that looks like it might scale badly with an increasing number of files in the directory:

get_xml = pdf_xml_pattern.findall(
 str(child_fundstelle_info.text))
get_xml = "".join(get_xml) + ".txt"
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, '*.txt')):
 if get_xml in filename:
 with open(filename, "r") as txtfile:
 ...

This loops over all text files in the directory to find the ones that contain the correct name. Instead, filter as soon as possible:

get_xml = pdf_xml_pattern.findall(
 str(child_fundstelle_info.text))
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, "*{}*.txt".format("".join(get_xml)))):
 with open(filename, "r") as txtfile:
 ...

This assumes that there are multiple files that can fit. If there is only one, then it becomes even simpler:

get_xml = pdf_xml_pattern.findall(
 str(child_fundstelle_info.text))
# txt path
filename = os.path.join(PATH_FINAL_TXT, "".join(get_xml) + ".txt")
with open(filename, "r") as txtfile:
 ...

Style

answered Jul 11 at 13:14

Graipher

20.4k42981

Runtime

This is the only obvious place I could find that looks like it might scale badly with an increasing number of files in the directory:

get_xml = pdf_xml_pattern.findall(
 str(child_fundstelle_info.text))
get_xml = "".join(get_xml) + ".txt"
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, '*.txt')):
 if get_xml in filename:
 with open(filename, "r") as txtfile:
 ...

This loops over all text files in the directory to find the ones that contain the correct name. Instead, filter as soon as possible:

get_xml = pdf_xml_pattern.findall(
 str(child_fundstelle_info.text))
# txt path
for filename in glob.glob(os.path.join(PATH_FINAL_TXT, "*{}*.txt".format("".join(get_xml)))):
 with open(filename, "r") as txtfile:
 ...

This assumes that there are multiple files that can fit. If there is only one, then it becomes even simpler:

get_xml = pdf_xml_pattern.findall(
 str(child_fundstelle_info.text))
# txt path
filename = os.path.join(PATH_FINAL_TXT, "".join(get_xml) + ".txt")
with open(filename, "r") as txtfile:
 ...

Style

answered Jul 11 at 13:14

Graipher

20.4k42981

answered Jul 11 at 13:14

Graipher

20.4k42981

answered Jul 11 at 13:14

Graipher

20.4k42981

answered Jul 11 at 13:14

Graipher

20.4k42981

I completely agree with all points in your post. I'll be more specific about your points in my future codes. I have just posted the cProfile output of about 6 hours. It looks like the problem lies with the re.split method, because the method gets more than 15 regexes with each iteration by the "|" .join.
– madik_atma
Jul 12 at 15:07

add a comment |

I completely agree with all points in your post. I'll be more specific about your points in my future codes. I have just posted the cProfile output of about 6 hours. It looks like the problem lies with the re.split method, because the method gets more than 15 regexes with each iteration by the "|" .join.
– madik_atma
Jul 12 at 15:07

I completely agree with all points in your post. I'll be more specific about your points in my future codes. I have just posted the cProfile output of about 6 hours. It looks like the problem lies with the re.split method, because the method gets more than 15 regexes with each iteration by the "|" .join.
– madik_atma
Jul 12 at 15:07

add a comment |

draft saved

draft discarded

draft saved

draft discarded

Post as a guest

Name

搜尋此網誌

trjhtr

Extracting text from Bundestag proceedings as directed by XML files, and exporting to JSON

1 Answer
1

Runtime

Style

Your Answer

Post as a guest

1 Answer
1

1 Answer
1

Runtime

Style

Runtime

Style

Runtime

Style

Runtime

Style

Post as a guest

Popular posts from this blog

Chat program with C++ and SFML

Read an image with ADNS2610 optical sensor and Arduino Uno

Read files from a directory using Promises

Extracting text from Bundestag proceedings as directed by XML files, and exporting to JSON

1 Answer 1

Runtime

Style

Your Answer

Sign up or log in

Post as a guest

Post as a guest

1 Answer 1

1 Answer 1

Runtime

Style

Runtime

Style

Runtime

Style

Runtime

Style

Sign up or log in

Post as a guest

Post as a guest

Sign up or log in

Post as a guest

Sign up or log in

Post as a guest

Sign up or log in

Post as a guest

Popular posts from this blog

Chat program with C++ and SFML

Read an image with ADNS2610 optical sensor and Arduino Uno

Read files from a directory using Promises

1 Answer
1

1 Answer
1

1 Answer
1