Implement HTML: lists

Also stop stripping text that merely looks like HTML tags.
This commit is contained in:
Huy Ngo 2024-08-15 12:14:25 +07:00
parent 5273d32c30
commit 33a397baef
2 changed files with 92 additions and 12 deletions

View file

@ -98,11 +98,12 @@ def colorize(message):
# Reset styles at the end to prevent leaking
yield STYLES["reset"]
return "".join(_generator(message))
return "".join(_generator(message)).replace('\\<', '<')
def strip_tags(message):
    """Return ``message`` prepared for output without colour styling.

    Text that looks like a tag is deliberately left in place, so that
    literal angle-bracket text is not lost. The only transformation is
    turning the escaped form ``\\<`` back into ``<``, mirroring the last
    step of ``colorize`` so both output paths display the same characters.

    NOTE(review): genuine style tags are no longer removed here; confirm
    that callers on the no-colour path do not rely on that.
    """
    return message.replace('\\<', '<')
@lru_cache(maxsize=None)

View file

@ -8,7 +8,7 @@ import warnings
from typing import Dict
from urllib.parse import quote, unquote, urlencode, urlparse
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, NavigableString
from witchie.exceptions import ConsoleError
@ -32,23 +32,102 @@ def parse_html(html: str) -> BeautifulSoup:
return BeautifulSoup(html.replace("&apos;", "'"), "html.parser")
def get_text(element):
    """Convert a parsed HTML element to plain text, dropping its markup.

    ``element`` is any object exposing ``get_text()`` (a BeautifulSoup tag
    or string node). The extracted text is NFKC-normalized and every ``<``
    is then escaped as ``\\<``; ``colorize`` turns the escape back into
    ``<`` when rendering.

    Returns a ``(text, links, images)`` tuple. ``links`` and ``images``
    are always empty for now.
    """
    # TODO: collect <a> targets and inline images so callers can print
    # numbered references; until then both lists stay empty.
    links = []
    images = []

    # Normalize before escaping: NFKC folds compatibility characters such
    # as the fullwidth "＜" (U+FF1C) into "<", which would slip past an
    # escape applied earlier.
    text = unicodedata.normalize("NFKC", element.get_text())
    return text.replace('<', '\\<'), links, images
def parse_element(element):
    """Render one top-level node of the parsed document as text chunks.

    Handling depends on ``element.name``:

    - ``ul``: each direct ``li`` child becomes ``"- <text>\\n"``
    - ``ol``: each direct ``li`` child becomes ``"<n>. <text>\\n"``,
      numbered from 1
    - ``p``: the paragraph text followed by a newline
    - ``br``: a single newline
    - anything else (including bare strings): the text as-is

    Lists are wrapped in a leading and a trailing ``"\\n"`` chunk.

    Returns a ``(texts, links, images)`` tuple, where ``texts`` is the list
    of chunks and ``links`` / ``images`` accumulate whatever ``get_text``
    reported for the nodes that were rendered.
    """
    links = []
    images = []
    texts = []

    def collect(node):
        # Render a node and fold its references into this call's results.
        text, node_links, node_images = get_text(node)
        links.extend(node_links)
        images.extend(node_images)
        return text

    name = element.name
    if name in ('ul', 'ol'):
        texts.append('\n')
        # Only direct children: a nested list's items are already part of
        # their parent item's text, so a recursive search would emit them
        # a second time (and throw off the numbering of ordered lists).
        items = element.find_all('li', recursive=False)
        for number, item in enumerate(items, start=1):
            marker = '- ' if name == 'ul' else str(number) + '. '
            texts.append(marker + collect(item) + '\n')
        texts.append('\n')
    elif name == 'p':
        texts.append(collect(element) + '\n')
    elif name == 'br':
        texts.append('\n')
    else:
        texts.append(collect(element))
    return texts, links, images
def html_to_paragraphs(html):
    """Convert an HTML fragment to text chunks while keeping line breaks.

    The fragment is parsed with ``parse_html`` and every top-level node is
    rendered by ``parse_element``. Consecutive chunks are then glued
    together until one ends with a newline, so each resulting entry is
    either newline-terminated or the trailing remainder.

    Returns a list holding a single list of these text entries.

    Currently rendered (see ``parse_element``):

    - ul lists are presented with hyphen bullets
    - ol lists are presented with numbers

    Planned, not implemented here yet:

    - bold (b/strong), italic (i/em), strikethrough (s/del), underline (u)
      presented as such
    - link (a) presented as underlined blue with a numbered reference that
      is listed at the bottom, e.g.
      This is <underline><blue>an example link</blue></underline>[1]
      [1]: https://example.com
    - blockquotes and code blocks fenced with a note
    - inline images presented as [img#n "alt text"] and listed at bottom
    """
    chunks = []
    for element in parse_html(html):
        # TODO: the links and images reported by parse_element are not
        # rendered as references yet, so they are discarded here.
        texts, _links, _images = parse_element(element)
        chunks.extend(texts)

    paragraphs = []
    for chunk in chunks:
        # Keep appending to the current entry until it is closed by a
        # trailing newline; only then start a new one.
        if paragraphs and not paragraphs[-1].endswith('\n'):
            paragraphs[-1] += chunk
        else:
            paragraphs.append(chunk)

    return [paragraphs]
def format_content(content):