Implement HTML: lists

Also stop stripping text that merely looks like HTML tags.
This commit is contained in:
Huy Ngo 2024-08-15 12:14:25 +07:00
parent 5273d32c30
commit 33a397baef
2 changed files with 92 additions and 12 deletions

View file

@ -98,11 +98,12 @@ def colorize(message):
# Reset styles at the end to prevent leaking # Reset styles at the end to prevent leaking
yield STYLES["reset"] yield STYLES["reset"]
return "".join(_generator(message)) return "".join(_generator(message)).replace('\\<', '<')
def strip_tags(message):
    """Return ``message`` with every match of ``STYLE_TAG_PATTERN`` removed."""
    # The previous trailing ``return message`` sat after this unconditional
    # return and could never execute, so it has been dropped.
    return re.sub(STYLE_TAG_PATTERN, "", message)
@lru_cache(maxsize=None) @lru_cache(maxsize=None)

View file

@ -8,7 +8,7 @@ import warnings
from typing import Dict from typing import Dict
from urllib.parse import quote, unquote, urlencode, urlparse from urllib.parse import quote, unquote, urlencode, urlparse
from bs4 import BeautifulSoup from bs4 import BeautifulSoup, NavigableString
from witchie.exceptions import ConsoleError from witchie.exceptions import ConsoleError
@ -32,23 +32,102 @@ def parse_html(html: str) -> BeautifulSoup:
return BeautifulSoup(html.replace("&apos;", "'"), "html.parser") return BeautifulSoup(html.replace("&apos;", "'"), "html.parser")
def get_text(element):
    """Convert a parsed HTML element to plain text, dropping all tags.

    Args:
        element: Any object exposing ``get_text()``, e.g. a parsed tag or a
            text node.

    Returns:
        A ``(text, links, images)`` tuple. ``text`` is NFKC-normalized and
        every ``<`` in it is prefixed with a backslash. ``links`` and
        ``images`` are always empty lists for now.
    """
    # TODO: collect link targets and inline images; until then both lists
    # are returned empty so callers can already unpack three values.
    links = []
    images = []

    # Normalize *before* escaping: NFKC folds compatibility characters such
    # as U+FF1C (fullwidth less-than) into a plain "<", which would slip
    # through unescaped if the escape ran first.
    text = unicodedata.normalize("NFKC", element.get_text())

    # Escape "<" so literal angle brackets in the content are not mistaken
    # for style tags later on (colorize() turns "\<" back into "<").
    text = text.replace('<', '\\<')
    return text, links, images
def parse_element(element):
    """Parse one top-level element into text chunks.

    Args:
        element: A node yielded by iterating the parsed document; its
            ``name`` selects the handling below.

    Returns:
        A ``(texts, links, images)`` tuple of lists. ``texts`` holds the
        rendered chunks in order; ``links`` and ``images`` accumulate
        whatever ``get_text`` reports for the processed nodes.
    """
    links = []
    images = []
    texts = []

    if element.name in ('ul', 'ol'):
        ordered = element.name == 'ol'
        # Lists are fenced by a newline chunk on both sides.
        texts.append('\n')
        # NOTE(review): find_all() searches recursively by default, so the
        # items of a nested list are also visited here -- confirm intended.
        for number, li in enumerate(element.find_all('li'), start=1):
            text, link, image = get_text(li)
            if ordered:
                # NOTE(review): numbered items end with two newlines while
                # bullet items end with one; kept as-is, confirm intended.
                texts.append(str(number) + '. ' + text + '\n' + '\n')
            else:
                texts.append('- ' + text + '\n')
            links.extend(link)
            # Accumulate into the result list; extending ``image`` with
            # itself (as before) never reached ``images``.
            images.extend(image)
        texts.append('\n')
    elif element.name == 'br':
        texts.append('\n')
    else:
        text, link, image = get_text(element)
        # Paragraphs are newline-terminated; any other node is kept inline.
        texts.append(text + '\n' if element.name == 'p' else text)
        # Paragraphs previously discarded their links and images.
        links.extend(link)
        images.extend(image)

    return texts, links, images
def html_to_paragraphs(html):
    """Convert an HTML string to newline-delimited text chunks.

    The document is parsed with ``parse_html`` and each top-level node is
    rendered by ``parse_element``. Consecutive chunks are glued together
    until one ends with a newline, which closes the current entry.

    Currently rendered:
    - ul lists are presented with hyphen-bullet
    - ol lists are presented with numbers

    Planned, not implemented in this function yet:
    - bold (b/strong), italic (i/em), strikethrough (s/del), underline (u)
      are presented as such
    - link (a) is presented as underlined blue and numbered reference,
      which is linked at bottom
        e.g. This is <underline><blue>an example link</blue></underline>[1]
        [1]: https://example.com
    - blockquotes and code blocks are fenced with note
    - inline images are presented as [img#n "alt text"] and linked at bottom

    Args:
        html: HTML source text.

    Returns:
        A one-element list wrapping the list of text entries.
    """
    soup = parse_html(html)

    parsed = []
    link_refs = []
    image_refs = []
    for element in soup:
        text, links, images = parse_element(element)
        parsed.extend(text)
        link_refs.extend(links)
        image_refs.extend(images)
    # TODO: render link_refs / image_refs as a numbered reference footer;
    # they are collected here but not emitted yet.

    paragraphs = []
    for text in parsed:
        if paragraphs and not paragraphs[-1].endswith('\n'):
            # The previous entry is still open: keep appending to it.
            paragraphs[-1] += text
        else:
            # First chunk, or the previous entry was closed by a newline.
            paragraphs.append(text)

    return [paragraphs]
def format_content(content): def format_content(content):