Implement HTML: lists

Also avoid stripping text that merely looks like HTML tags.
2024-08-15 12:14:25 +07:00 · 2024-08-15 12:14:25 +07:00 · 33a397baef
commit 33a397baef
parent 5273d32c30
2 changed files with 92 additions and 12 deletions
--- a/witchie/output.py
+++ b/witchie/output.py
@ -98,11 +98,12 @@ def colorize(message):
            # Reset styles at the end to prevent leaking
            yield STYLES["reset"]
-    return "".join(_generator(message))
+    return "".join(_generator(message)).replace('\\<', '<')
 def strip_tags(message):
    """Return ``message`` with every match of STYLE_TAG_PATTERN removed.

    The substitution is delegated to ``re.sub``; text that does not match
    the pattern is returned unchanged.
    """
    return re.sub(STYLE_TAG_PATTERN, "", message)
@lru_cache(maxsize=None)
--- a/witchie/utils/init.py
+++ b/witchie/utils/init.py
@ -8,7 +8,7 @@ import warnings
 from typing import Dict
 from urllib.parse import quote, unquote, urlencode, urlparse
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, NavigableString
 from witchie.exceptions import ConsoleError
@ -32,23 +32,102 @@ def parse_html(html: str) -> BeautifulSoup:
        return BeautifulSoup(html.replace("&apos;", "'"), "html.parser")
-def get_text(html):
+def get_text(element):
    """Converts html to text, strips all tags."""
-    text = parse_html(html).get_text()
+    links = []
-    return unicodedata.normalize("NFKC", text)
+    images = []
    # process links
    # if type(element) is NavigableString:
    #     return unicodedata.normalize("NFKC", element.get_text()), links, images
    # soup = BeautifulSoup("", "html.parser")
    # for a in element.find_all('a'):
    #     links.append(a.href)
    #     new_tag = soup.new_tag('a')
    #     new_tag.string = '<cyan>' + a.text + '</cyan>'
    #     a.replace_with(new_tag)
    text = element.get_text()
    text = text.replace('<', '\\<')
    return unicodedata.normalize("NFKC", text), links, images
 def parse_element(element):
    """Parse a top-level element into text chunks plus link/image references.

    Dispatches on ``element.name``:

    - ``ul``: one ``'- <text>\\n'`` chunk per ``li``, framed by ``'\\n'`` chunks
    - ``ol``: one ``'<n>. <text>\\n'`` chunk per ``li`` (numbered from 1),
      framed by ``'\\n'`` chunks
    - ``p``: the element text followed by ``'\\n'``
    - ``br``: a single ``'\\n'`` chunk
    - anything else: the element text as-is

    Returns a ``(texts, links, images)`` tuple of lists. ``links`` and
    ``images`` accumulate whatever ``get_text`` reports for each node that
    was converted to text.
    """
    links = []
    images = []
    texts = []

    def collect(node, prefix='', suffix=''):
        # Convert one node to text and keep its references. Extending the
        # outer ``links``/``images`` lists (not the per-call temporaries) is
        # what makes the references reach the caller.
        text, node_links, node_images = get_text(node)
        texts.append(prefix + text + suffix)
        links.extend(node_links)
        images.extend(node_images)

    # NOTE(review): find_all('li') presumably also matches items of nested
    # lists, whose text is already part of their parent item - confirm
    # whether nested lists need dedicated handling.
    if element.name == 'ul':
        texts.append('\n')
        for li in element.find_all('li'):
            collect(li, '- ', '\n')
        texts.append('\n')
    elif element.name == 'ol':
        texts.append('\n')
        for number, li in enumerate(element.find_all('li'), start=1):
            # Exactly one trailing newline per item, matching the ul branch.
            collect(li, str(number) + '. ', '\n')
        texts.append('\n')
    elif element.name == 'p':
        collect(element, suffix='\n')
    elif element.name == 'br':
        texts.append('\n')
    else:
        collect(element)
    return texts, links, images
 def html_to_paragraphs(html):
-    """Attempt to convert html to plain text while keeping line breaks.
+    """Parse html properly.
-    Returns a list of paragraphs, each being a list of lines.
+
    - bold (b/strong), italic (i/em), strikethrough (s/del), underline (u) are presented as such
    - link (a) is presented as underlined blue and numbered reference, which is linked at bottom
        e.g. This is <underline><blue>an example link</blue></underline>[1]
        [1]: https://example.com
    - blockquotes and code blocks are fenced with note
    - inline images are presented as [img#n "alt text"] and linked at bottom
    - ul lists are presented with hyphen-bullet
    - ol lists are presented with numbers
    """
-    paragraphs = re.split("</?p[^>]*>", html)
+    # paragraphs = re.split("</?p[^>]*>", html)
-    # Convert <br>s to line breaks and remove empty paragraphs
+    # # Convert <br>s to line breaks and remove empty paragraphs
-    paragraphs = [re.split("<br */?>", p) for p in paragraphs if p]
+    # paragraphs = [re.split("<br */?>", p) for p in paragraphs if p]
-    # Convert each line in each paragraph to plain text:
+    # # Convert each line in each paragraph to plain text:
-    return [[get_text(line) for line in p] for p in paragraphs]
+    # return [[get_text(line) for line in p] for p in paragraphs]
    soup = parse_html(html)
    parsed = []
    link_refs = []
    image_refs = []
    for element in soup:
        text, links, images = parse_element(element)
        parsed.extend(text)
        link_refs.extend(links)
        image_refs.extend(images)
    paragraphs = []
    for text in parsed:
        if len(paragraphs) == 0:
            paragraphs.append(text)
            continue
        if paragraphs[-1].endswith('\n'):
            paragraphs.append(text)
        else:
            paragraphs[-1] += text
    # print(paragraphs)
    # for link in link_refs:
    return [paragraphs]
 def format_content(content):