Implement HTML: lists

Also stop stripping text that merely looks like HTML tags.
2024-08-15 12:14:25 +07:00 · 2024-08-15 12:14:25 +07:00 · 33a397baef
commit 33a397baef
parent 5273d32c30
2 changed files with 92 additions and 12 deletions
--- a/witchie/output.py
+++ b/witchie/output.py
@ -98,11 +98,12 @@ def colorize(message):
            # Reset styles at the end to prevent leaking
            yield STYLES["reset"]

-    return "".join(_generator(message))
+    return "".join(_generator(message)).replace('\\<', '<')


 def strip_tags(message):
    return re.sub(STYLE_TAG_PATTERN, "", message)
+    return message


@lru_cache(maxsize=None)
--- a/witchie/utils/init.py
+++ b/witchie/utils/init.py
@ -8,7 +8,7 @@ import warnings
 from typing import Dict
 from urllib.parse import quote, unquote, urlencode, urlparse

-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, NavigableString

 from witchie.exceptions import ConsoleError

@ -32,23 +32,102 @@ def parse_html(html: str) -> BeautifulSoup:
        return BeautifulSoup(html.replace("&apos;", "'"), "html.parser")


-def get_text(html):
+def get_text(element):
    """Converts html to text, strips all tags."""
-    text = parse_html(html).get_text()
-    return unicodedata.normalize("NFKC", text)
+    links = []
+    images = []
+    # process links
+    # if type(element) is NavigableString:
+    #     return unicodedata.normalize("NFKC", element.get_text()), links, images
+    # soup = BeautifulSoup("", "html.parser")
+    # for a in element.find_all('a'):
+    #     links.append(a.href)
+    #     new_tag = soup.new_tag('a')
+    #     new_tag.string = '<cyan>' + a.text + '</cyan>'
+    #     a.replace_with(new_tag)
+    text = element.get_text()
+    text = text.replace('<', '\\<')
+    return unicodedata.normalize("NFKC", text), links, images
+
+
+def parse_element(element):
+    """Parse top level element."""
+    links = []
+    images = []
+    texts = []
+    if element.name == 'ul':
+        texts.append('\n')
+        for li in element.find_all('li'):
+            text, link, image = get_text(li)
+            text = '- ' + text + '\n'
+            texts.append(text)
+            links.extend(link)
+            image.extend(image)
+        texts.append('\n')
+    elif element.name == 'ol':
+        texts.append('\n')
+        for i, li in enumerate(element.find_all('li')):
+            text, link, image = get_text(li)
+            text = str(i + 1) + '. ' + text + '\n'
+            texts.append(text + '\n')
+            links.extend(link)
+            image.extend(image)
+        texts.append('\n')
+    elif element.name == 'p':
+        text, link, image = get_text(element)
+        texts.append(text + '\n')
+    elif element.name == 'br':
+        texts.append('\n')
+    else:
+        text, link, image = get_text(element)
+        texts.append(text)
+        links.extend(link)
+        image.extend(image)
+    return texts, links, images


 def html_to_paragraphs(html):
-    """Attempt to convert html to plain text while keeping line breaks.
-    Returns a list of paragraphs, each being a list of lines.
+    """Parse html properly.
+
+    - bold (b/strong), italic (i/em), strikethrough (s/del), underline (u) are presented as such
+    - link (a) is presented as underlined blue and numbered reference, which is linked at bottom
+        e.g. This is <underline><blue>an example link</blue></underline>[1]
+
+        [1]: https://example.com
+    - blockquotes and code blocks are fenced with note
+    - inline images are presented as [img#n "alt text"] and linked at bottom
+    - ul lists are presented with hyphen-bullet
+    - ol lists are presented with numbers
    """
-    paragraphs = re.split("</?p[^>]*>", html)
+    # paragraphs = re.split("</?p[^>]*>", html)

-    # Convert <br>s to line breaks and remove empty paragraphs
-    paragraphs = [re.split("<br */?>", p) for p in paragraphs if p]
+    # # Convert <br>s to line breaks and remove empty paragraphs
+    # paragraphs = [re.split("<br */?>", p) for p in paragraphs if p]

-    # Convert each line in each paragraph to plain text:
-    return [[get_text(line) for line in p] for p in paragraphs]
+    # # Convert each line in each paragraph to plain text:
+    # return [[get_text(line) for line in p] for p in paragraphs]
+    soup = parse_html(html)
+    parsed = []
+    link_refs = []
+    image_refs = []
+    for element in soup:
+        text, links, images = parse_element(element)
+        parsed.extend(text)
+        link_refs.extend(links)
+        image_refs.extend(images)
+
+    paragraphs = []
+    for text in parsed:
+        if len(paragraphs) == 0:
+            paragraphs.append(text)
+            continue
+        if paragraphs[-1].endswith('\n'):
+            paragraphs.append(text)
+        else:
+            paragraphs[-1] += text
+    # print(paragraphs)
+    # for link in link_refs:
+    return [paragraphs]


 def format_content(content):