Implement nested list

Huy Ngo 2024-08-15 15:40:26 +07:00
parent 4adaf21861
commit e72be437e6


@@ -41,7 +41,7 @@ def get_text(element, links, images):
         text = element.text
         if (element['href'] != element.text
                 and 'mention' not in element.get('class', [])
-                and element.get('rel') != 'tag'):
+                and 'tag' not in element.get('rel')):
             links.append(element['href'])
             text = f'{text}[{len(links)}]'
         return f'<cyan>{text}</cyan>'
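
For context, BeautifulSoup treats rel as a multi-valued attribute, so element.get('rel') comes back as a list such as ['tag'] rather than the bare string, and the old equality test never matched hashtag links. A quick standalone sketch (the URL and the variable name are made up):

from bs4 import BeautifulSoup

# rel is multi-valued in BeautifulSoup, so .get('rel') is a list, never 'tag'.
a = BeautifulSoup('<a href="https://example.com/tags/x" rel="tag">#x</a>',
                  'html.parser').a
print(a.get('rel'))               # ['tag']
print(a.get('rel') != 'tag')      # True  -- the old check let hashtags through
print('tag' not in a.get('rel'))  # False -- the new check skips them

Note that the new membership test assumes rel is present: element.get('rel') is None for anchors without the attribute.
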
@@ -50,52 +50,47 @@ def get_text(element, links, images):
         images.append(element['src'])
         text = f'{text}[{len(images)}]'
         return f'<cyan>{text}</cyan>'
     for a in element.find_all('a'):
         soup = BeautifulSoup("", "html.parser")
         if a['href'] != a.text and 'mention' not in a.get('class', []) and a.get('rel') != 'tag':
             links.append(a.href)
         new_tag = soup.new_tag('a')
         new_tag.string = '&lt;cyan>' + a.text + '&lt;/cyan>'
         a.replace_with(new_tag)
     text = element.get_text()
     text = text.replace('<', '\\<')
     text = text.replace('&lt;', '<')
     return unicodedata.normalize("NFKC", text)
-def parse_element(element, links, images):
+def parse_element(element, links, images, depth=0, list_type=None):
     """Parse top level element."""
-    texts = []
+    texts = ''
     if element.name == 'ul':
-        texts.append('\n')
-        for li in element.find_all('li'):
-            text = get_text(li, links, images)
-            text = '- ' + text + '\n'
-            texts.append(text)
-        texts.append('\n')
+        for li in element.children:
+            text = parse_element(li, links, images, depth, '- ')
+            texts += text
     elif element.name == 'ol':
-        texts.append('\n')
-        for i, li in enumerate(element.find_all('li')):
-            text = get_text(li, links, images)
-            text = str(i + 1) + '. ' + text + '\n'
-            texts.append(text + '\n')
-        texts.append('\n')
+        for i, li in enumerate(element.children):
+            text = parse_element(li, links, images, depth, str(i + 1) + '. ')
+            texts += text
+    elif element.name == 'li':
+        text = '<dim>' + '··' * depth + '</dim>' + (list_type or '')
+        for child in element.children:
+            if child.name in ('ul', 'ol'):
+                text += parse_element(child, links, images, depth + 1)
+            else:
+                text += parse_element(child, links, images, depth) + '\n'
+        texts += text
     elif element.name == 'blockquote':
-        texts.append('<yellow>quote:</yellow>\n')
+        texts += '<yellow>quote:</yellow>\n'
         for child in element.children:
             text = parse_element(child, links, images)
-            texts.extend(text)
-        texts.append('<yellow>endquote</yellow>\n')
+            texts += text
+        texts += '<yellow>endquote</yellow>\n'
     elif element.name == 'p':
         for child in element.children:
             text = parse_element(child, links, images)
-            texts.extend(text)
-        texts.append('\n')
+            texts += text
+        texts += '\n'
     elif element.name == 'br':
-        texts.append('\n')
+        texts += '\n'
     else:
         text = get_text(element, links, images)
-        texts.append(text)
+        texts += text
     return texts
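
The new li branch is what makes arbitrary nesting work: each li emits its own '··'-per-level indent (wrapped in <dim> markup) plus the '- ' or 'n. ' prefix handed down by the enclosing ul/ol, and a list nested inside an li recurses with depth + 1. A standalone sketch of that recursion, with illustrative names, no link/image bookkeeping, and the <dim>/<cyan> markup dropped:

from bs4 import BeautifulSoup

def render(element, depth=0, list_type=None):
    """Toy version of the nested-list recursion: ul/ol/li only."""
    name = getattr(element, 'name', None)  # text nodes have no tag name
    if name == 'ul':
        return ''.join(render(li, depth, '- ') for li in element.children)
    if name == 'ol':
        return ''.join(render(li, depth, str(i + 1) + '. ')
                       for i, li in enumerate(element.children))
    if name == 'li':
        line = '··' * depth + (list_type or '')
        for child in element.children:
            if getattr(child, 'name', None) in ('ul', 'ol'):
                line += render(child, depth + 1)  # nested list: one level deeper
            else:
                line += render(child, depth) + '\n'
        return line
    # Leaf fallback; the real code calls its own get_text() helper here.
    return element.get_text() if name else str(element)

# No whitespace between tags, since children is iterated directly (as in the commit).
html = '<ul><li>fruits<ul><li>apple</li><li>banana</li></ul></li><li>nuts</li></ul>'
print(render(BeautifulSoup(html, 'html.parser').ul))
# - fruits
# ··- apple
# ··- banana
# - nuts
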
@@ -112,32 +107,17 @@ def html_to_paragraphs(html):
     - ul lists are presented with hyphen-bullet
     - ol lists are presented with numbers
     """
-    # paragraphs = re.split("</?p[^>]*>", html)
-    # # Convert <br>s to line breaks and remove empty paragraphs
-    # paragraphs = [re.split("<br */?>", p) for p in paragraphs if p]
-    # # Convert each line in each paragraph to plain text:
-    # return [[get_text(line) for line in p] for p in paragraphs]
     soup = parse_html(html)
-    parsed = []
+    parsed = ''
     links = []
     images = []
     for element in soup:
         text = parse_element(element, links, images)
-        parsed.extend(text)
+        parsed += text
-    paragraphs = []
-    for text in parsed:
-        if len(paragraphs) == 0:
-            paragraphs.append(text)
-            continue
-        if paragraphs[-1].endswith('\n'):
-            paragraphs.append(text)
-        else:
-            paragraphs[-1] += text
-    if links or images:
-        paragraphs.append('')
+    if parsed[-1] == '\n':
+        parsed = parsed[:-1]
+    paragraphs = parsed.split('\n')
     for i, link in enumerate(links):
         paragraphs.append(f'[{i + 1}]: {link}')
     for i, link in enumerate(images):