Implement HTML: lists
also not stripping text that looks like html tags
parent 5273d32c30
commit 33a397baef
2 changed files with 92 additions and 12 deletions
@@ -98,11 +98,12 @@ def colorize(message):
         # Reset styles at the end to prevent leaking
         yield STYLES["reset"]
 
-    return "".join(_generator(message))
+    return "".join(_generator(message)).replace('\\<', '<')
 
 
 def strip_tags(message):
     return re.sub(STYLE_TAG_PATTERN, "", message)
+    return message
 
 
 @lru_cache(maxsize=None)
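The colorize()/strip_tags() change above works together with the new text.replace('<', '\\<') in get_text() further down: literal '<' in post text is escaped before the style-tag pass, so text that merely looks like an HTML tag is no longer stripped or colorized, and colorize() restores the '<' as its final step. Below is a rough, self-contained sketch of that round trip; it is not witchie's code, and the simplified pattern assumes the real STYLE_TAG_PATTERN skips a backslash-escaped '<', which is what the new escaping relies on.

import re

# Illustrative stand-in only; witchie's real STYLE_TAG_PATTERN, STYLES and
# colorize() live in the module patched above. The lookbehind is the assumed
# "skip escaped tags" behaviour.
STYLE_TAG_PATTERN = re.compile(r"(?<!\\)</?\w+>")

def escape_text(text):
    # what the new get_text() does to post text
    return text.replace("<", "\\<")

def colorize(message):
    # stand-in for colorize(): drop style tags instead of emitting ANSI codes,
    # then restore the escaped "<" at the very end, as the patch above does
    return re.sub(STYLE_TAG_PATTERN, "", message).replace("\\<", "<")

post = escape_text("I <3 writing <code> in posts")
print(colorize("<bold>" + post + "</bold>"))
# prints: I <3 writing <code> in posts

Without the escape, text such as "<code>" risked being matched by the style-tag pass and silently dropped, which is the behaviour the commit message refers to.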
@@ -8,7 +8,7 @@ import warnings
 from typing import Dict
 from urllib.parse import quote, unquote, urlencode, urlparse
 
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, NavigableString
 
 from witchie.exceptions import ConsoleError
 
@@ -32,23 +32,102 @@ def parse_html(html: str) -> BeautifulSoup:
     return BeautifulSoup(html.replace("&apos;", "'"), "html.parser")
 
 
-def get_text(html):
+def get_text(element):
     """Converts html to text, strips all tags."""
-    text = parse_html(html).get_text()
-    return unicodedata.normalize("NFKC", text)
+    links = []
+    images = []
+    # process links
+    # if type(element) is NavigableString:
+    #     return unicodedata.normalize("NFKC", element.get_text()), links, images
+    # soup = BeautifulSoup("", "html.parser")
+    # for a in element.find_all('a'):
+    #     links.append(a.href)
+    #     new_tag = soup.new_tag('a')
+    #     new_tag.string = '<cyan>' + a.text + '</cyan>'
+    #     a.replace_with(new_tag)
+    text = element.get_text()
+    text = text.replace('<', '\\<')
+    return unicodedata.normalize("NFKC", text), links, images
+
+
+def parse_element(element):
+    """Parse top level element."""
+    links = []
+    images = []
+    texts = []
+    if element.name == 'ul':
+        texts.append('\n')
+        for li in element.find_all('li'):
+            text, link, image = get_text(li)
+            text = '- ' + text + '\n'
+            texts.append(text)
+            links.extend(link)
+            images.extend(image)
+        texts.append('\n')
+    elif element.name == 'ol':
+        texts.append('\n')
+        for i, li in enumerate(element.find_all('li')):
+            text, link, image = get_text(li)
+            text = str(i + 1) + '. ' + text + '\n'
+            texts.append(text + '\n')
+            links.extend(link)
+            images.extend(image)
+        texts.append('\n')
+    elif element.name == 'p':
+        text, link, image = get_text(element)
+        texts.append(text + '\n')
+    elif element.name == 'br':
+        texts.append('\n')
+    else:
+        text, link, image = get_text(element)
+        texts.append(text)
+        links.extend(link)
+        images.extend(image)
+    return texts, links, images
 
 
 def html_to_paragraphs(html):
-    """Attempt to convert html to plain text while keeping line breaks.
-    Returns a list of paragraphs, each being a list of lines.
+    """Parse html properly.
+
+    - bold (b/strong), italic (i/em), strikethrough (s/del), underline (u) are presented as such
+    - link (a) is presented as underlined blue and numbered reference, which is linked at bottom
+      e.g. This is <underline><blue>an example link</blue></underline>[1]
+
+      [1]: https://example.com
+    - blockquotes and code blocks are fenced with note
+    - inline images are presented as [img#n "alt text"] and linked at bottom
+    - ul lists are presented with hyphen-bullet
+    - ol lists are presented with numbers
     """
-    paragraphs = re.split("</?p[^>]*>", html)
+    # paragraphs = re.split("</?p[^>]*>", html)
 
-    # Convert <br>s to line breaks and remove empty paragraphs
-    paragraphs = [re.split("<br */?>", p) for p in paragraphs if p]
+    # # Convert <br>s to line breaks and remove empty paragraphs
+    # paragraphs = [re.split("<br */?>", p) for p in paragraphs if p]
 
-    # Convert each line in each paragraph to plain text:
-    return [[get_text(line) for line in p] for p in paragraphs]
+    # # Convert each line in each paragraph to plain text:
+    # return [[get_text(line) for line in p] for p in paragraphs]
+    soup = parse_html(html)
+    parsed = []
+    link_refs = []
+    image_refs = []
+    for element in soup:
+        text, links, images = parse_element(element)
+        parsed.extend(text)
+        link_refs.extend(links)
+        image_refs.extend(images)
+
+    paragraphs = []
+    for text in parsed:
+        if len(paragraphs) == 0:
+            paragraphs.append(text)
+            continue
+        if paragraphs[-1].endswith('\n'):
+            paragraphs.append(text)
+        else:
+            paragraphs[-1] += text
+    # print(paragraphs)
+    # for link in link_refs:
+    return [paragraphs]
 
 
 def format_content(content):
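For orientation, here is a minimal, self-contained sketch of the list handling that the new parse_element()/html_to_paragraphs() pair introduces. It is not witchie's code: the render() helper and the sample HTML are invented for illustration, and it leaves out the '<' escaping, the link/image reference lists and the paragraph merging that the real functions carry.

from bs4 import BeautifulSoup

def render(html):
    # Walk the top-level elements of a status body, the same way the new
    # html_to_paragraphs() iterates over the parsed soup.
    soup = BeautifulSoup(html, "html.parser")
    lines = []
    for element in soup:
        if element.name == "ul":
            # unordered lists get a hyphen bullet
            lines.extend("- " + li.get_text() for li in element.find_all("li"))
        elif element.name == "ol":
            # ordered lists get 1. 2. 3. numbering
            lines.extend(f"{i}. {li.get_text()}"
                         for i, li in enumerate(element.find_all("li"), start=1))
        else:
            lines.append(element.get_text())
    return "\n".join(lines)

print(render("<p>Groceries:</p><ul><li>eggs</li><li>milk &amp; honey</li></ul>"))
# prints:
# Groceries:
# - eggs
# - milk & honey

In the real code, parse_element() also threads links and images through as reference lists (collected but not rendered yet; the link-handling block in get_text() is still commented out), and html_to_paragraphs() merges adjacent text fragments into paragraphs by checking whether the previous fragment ended with a newline.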