Some of the words in the output are split when I run this code. For example, the word "tolerances" comes out as "tole rances". I looked at the HTML source, and it seems that is how the page was created.
There are many other places where words are split in the same way. How do I recombine them before writing the text to a file?
import requests, codecs
from bs4 import BeautifulSoup
from bs4.element import Comment
# Output directory; the trailing backslash lets a file name be appended directly.
path = 'C:\\Users\\jason\\Google Drive\\python\\'
def tag_visible(element):
    """Return True if a parsed text node should be kept in the output.

    A node is rejected when its direct parent is a ``<sup>`` tag, or when
    the node itself is an HTML comment.

    Args:
        element: A BeautifulSoup text node (any object exposing
            ``.parent.name``).

    Returns:
        bool: False for text directly under ``<sup>`` and for comments,
        True for everything else.
    """
    # Text whose immediate parent is <sup> is skipped.
    if element.parent.name == 'sup':
        return False
    # Comments are markup, not page text.
    return not isinstance(element, Comment)
# Filing identifiers; nothing in the code visible here reads these two.
ticker, quarter = 'TSLA', '18Q2'

# Markers handed to get_text(): the extract starts at the first one and
# stops just before the second.
mark1, mark2 = 'ITEM 1A', 'UNREGISTERED SALES'

# Address of the SEC EDGAR document to download.
url_new = 'https://www.sec.gov/Archives/edgar/data/1318605/000156459018019254/tsla-10q_20180630.htm'
# Tags that render as their own line or cell. Text on either side of them
# must stay separated even when the markup has no whitespace at that spot.
_BLOCK_TAGS = ['p', 'div', 'br', 'hr', 'table', 'tr', 'td', 'th', 'li',
               'ul', 'ol', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']


def _section_between(text, mark1, mark2):
    """Return text from the first mark1 up to (excluding) the next mark2."""
    start = text.find(mark1)
    if start == -1:
        # Without the start marker there is nothing meaningful to return;
        # slicing from -1 would silently yield the last character only.
        return u""
    # Look for the end marker only after the start marker.
    end = text.find(mark2, start)
    if end == -1:
        # No end marker: keep everything to the end instead of losing the
        # final character to a -1 slice bound.
        end = len(text)
    return text[start:end]


def get_text(url, mark1, mark2, timeout=30):
    """Download a page and return its visible text between two markers.

    Text nodes are concatenated exactly as they appear in the document, so
    a word that the markup spreads over several inline tags (for example
    ``<font>tole</font><font>rances</font>``) comes out as one word. A
    space is inserted only around block-level tags, and runs of whitespace
    are then collapsed to a single space.

    Args:
        url: Address of the HTML document to fetch.
        mark1: Substring at which the returned text starts (inclusive).
        mark2: Substring, searched for after mark1, at which the returned
            text stops (exclusive).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text from mark1 up to mark2; everything from mark1 onward if
        mark2 is missing; an empty string if mark1 is missing.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with a 4xx/5xx status.
    """
    html = requests.get(url, timeout=timeout)
    # Fail loudly rather than parsing an error page as if it were the filing.
    html.raise_for_status()
    soup = BeautifulSoup(html.text, 'html.parser')

    # Remove the paragraph that precedes each <hr>.
    for hr in soup.select('hr'):
        previous_p = hr.find_previous('p')
        if previous_p is not None:  # an <hr> may have no <p> before it
            previous_p.extract()

    # Keep text on opposite sides of a block-level tag apart; inline tags
    # get no separator, which is what keeps split words together.
    for block in soup.find_all(_BLOCK_TAGS):
        block.insert_before(u" ")
        block.insert_after(u" ")

    texts = soup.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    raw = u"".join(visible_texts)
    # Collapse newlines, non-breaking spaces and repeated blanks.
    text = u" ".join(raw.split())
    return _section_between(text, mark1, mark2)
# Pull the section between the two markers out of the filing.
text = get_text(url_new, mark1, mark2)

# Save it as UTF-8; the with-block closes the file even if the write fails.
with codecs.open(path + "test.txt", 'w', encoding='utf8') as out_file:
    out_file.write(text)
from beautifulsoup how to recombine words
No comments:
Post a Comment