Preprocess HTML input and generate word count

Below you see the product of my first baby-steps in programming. The purpose of the script is twofold:
1. Take html input of a specific website, process it, and return relevant info such as document id, text, and headline.
2. Generate a count of the words in all the articles.



The script works and does what it is supposed to; however, I cannot help but feel that I'm missing a lot in terms of performance.



import re
import pandas as pd
from urllib.request import urlopen as uReq
from sklearn.feature_extraction.text import CountVectorizer

TAG_RE = re.compile(r'<[^>]+>')
def RemoveTags(text):
    """Remove all html tags"""
    return TAG_RE.sub('', text)

ESCAPES_RE = re.compile(r'\\.')
def RemoveEscapes(text):
    """Remove extra escape characters from encoding"""
    return ESCAPES_RE.sub('', text)

def ReadFromLink(link):
    """Read html from link and return raw html"""
    with uReq(link) as response:
        html = response.read()
    html = str(html).lower()
    return html.lower()

def ArticleRaw(html):
    """Find articles in html"""
    article = re.findall(r'<doc>.*?</doc>', html)
    return article

def GetDocID(html):
    """Find document ids in html"""
    docid = re.findall(r'<docid>(.*?)</docid>', html)
    docid = [docid.strip() for docid in docid]
    docid = [int(docid) for docid in docid]
    return docid

def GetHeadline(html):
    """Find headlines in html"""
    headline = re.findall(r'<headline>(.*?)</headline>', html)
    headline = [RemoveTags(headline) for headline in headline]
    headline = [RemoveEscapes(headline) for headline in headline]
    return headline

def GetMainText(html):
    """Find maintext in html"""
    maintext = re.findall(r'<text>(.*?)</text>', html)
    maintext = [RemoveTags(maintext) for maintext in maintext]
    maintext = [RemoveEscapes(maintext) for maintext in maintext]
    maintext = [' '.join(maintext.split()) for maintext in maintext]
    return maintext

link = link
html = ReadFromLink(link)

ArticlesDict = {
    "docid": GetDocID(html),
    "raw_article": ArticleRaw(html),
    "headline": GetHeadline(html),
    "maintext": GetMainText(html)
}

def CountFeatures(text):
    documents = ArticlesDict['maintext']
    # Stem first?
    vector = CountVectorizer()
    x = vector.fit_transform(documents)
    df_features = pd.DataFrame(x.toarray(), columns=vector.get_feature_names())
    return df_features

df_features = CountFeatures(df_articles['maintext'])






asked Mar 2 at 19:09 by Daniel Hansen
edited Mar 2 at 19:15 by 200_success




















1 Answer (accepted)










If I may suggest, using a tool like Beautiful Soup can greatly help you work with html elements in a simple way:



          http://www.pythonforbeginners.com/python-on-the-web/web-scraping-with-beautifulsoup/



Here is a very brief example of how it operates:



from bs4 import BeautifulSoup
import requests

r = requests.get("http://any_url_you_want.com")
data = r.text

soup = BeautifulSoup(data, 'html.parser')  # name a parser explicitly to avoid a warning

for text in soup.find_all('text'):
    pass  # Here you do whatever you want with text

You can adapt your methods to use these functions, keyed on whichever tags you need, or however you want.
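To make that concrete, here is a minimal sketch of how the question's regex-based extractors could be rewritten on top of BeautifulSoup. The sample document and the `parse_articles` helper are hypothetical, just mimicking the `<doc>`/`<docid>`/`<headline>`/`<text>` structure the question's regexes target:

```python
from bs4 import BeautifulSoup

# Hypothetical sample mimicking the corpus structure from the question
sample = """
<doc>
  <docid> 42 </docid>
  <headline>Example <b>headline</b></headline>
  <text>Some   body text.</text>
</doc>
"""

def parse_articles(html):
    """Extract the same fields as the question's regexes, via BeautifulSoup."""
    soup = BeautifulSoup(html, "html.parser")
    articles = []
    for doc in soup.find_all("doc"):
        articles.append({
            "docid": int(doc.find("docid").get_text().strip()),
            # get_text() drops nested tags, like the TAG_RE regex did
            "headline": doc.find("headline").get_text(),
            # collapse runs of whitespace, like the ' '.join(...split()) step
            "maintext": " ".join(doc.find("text").get_text().split()),
        })
    return articles

print(parse_articles(sample))
```

The parser handles nesting and malformed markup for you, so the `RemoveTags`/`RemoveEscapes` regex passes largely disappear.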



Also check this article; it explains quite well what you can do with it and is accessible for beginners:



          https://www.digitalocean.com/community/tutorials/how-to-scrape-web-pages-with-beautiful-soup-and-python-3






answered Mar 3 at 0:07 by A. Romeu

• There is now also github.com/kennethreitz/requests-html :) – hjpotter92, Mar 3 at 2:06
• I've tried it out and it works! Thank you very much for the feedback. – Daniel Hansen, Mar 7 at 8:57
• Very happy to hear :) I will also take a look at requests-html, thanks hjpotter92 for the suggestion – A. Romeu, Mar 7 at 9:09