# -*- coding: utf-8 -*- from __future__ import unicode_literals from __future__ import print_function import re from bs4 import BeautifulSoup def get_text(html): """Converts html to text, strips all tags.""" return BeautifulSoup(html, "html.parser").get_text().replace(''', "'") def parse_html(html): """Attempt to convert html to plain text while keeping line breaks. Returns a list of paragraphs, each being a list of lines. """ paragraphs = re.split("]*>", html) # Convert
s to line breaks and remove empty paragraphs paragraphs = [re.split("
", p) for p in paragraphs if p] # Convert each line in each paragraph to plain text: return [[get_text(l) for l in p] for p in paragraphs] def format_content(content): """Given a Status contents in HTML, converts it into lines of plain text. Returns a generator yielding lines of content. """ paragraphs = parse_html(content) first = True for paragraph in paragraphs: if not first: yield "" for line in paragraph: yield line first = False