WIP: try python html parser for proper parsing

2024-08-16 18:36:02 +07:00 · 2024-08-16 18:36:02 +07:00 · 7ab79be936
commit 7ab79be936
parent 0ba5f2fc7a
1 changed files with 41 additions and 0 deletions
--- a/witchie/utils/init.py
+++ b/witchie/utils/init.py
@@ -5,6 +5,8 @@ import subprocess
 import tempfile
 import unicodedata
 import warnings
+from collections import deque
+from html.parser import HTMLParser
 from typing import Dict
 from urllib.parse import quote, unquote, urlencode, urlparse

@@ -149,6 +151,42 @@ def parse_element(element, links, images, depth=0, list_type=None):
    return texts


+class HTMLTextParser(HTMLParser):
+    """Parse HTML to text.
+
+    Feed markup through ``feed()``; the rendered text accumulates in
+    ``output``.  ``tag_stack`` tracks the currently open (non-void)
+    elements.  ``links`` and ``image_links`` are created empty and are not
+    populated by this class yet (work in progress).
+    """
+
+    # Void elements never have a closing tag, so they are rendered right
+    # away and must never be pushed on (or popped from) the tag stack.
+    _VOID_TAGS = ('img', 'br', 'hr', 'wbr')
+
+    def __init__(self):
+        # HTMLParser.__init__() calls self.reset(), which creates all of
+        # the per-document state, so nothing else has to be set up here.
+        super().__init__()
+
+    def reset(self):
+        """Reset the instance and clear all processed data."""
+        super().reset()
+        self.tag_stack = deque()
+        self.output = ''
+        self.links = []
+        self.image_links = []
+
+    def handle_starttag(self, tag, attrs):
+        """Record an opening tag; void tags are rendered immediately."""
+        if tag in self._VOID_TAGS:
+            # just in case they're not closed
+            self.handle_startendtag(tag, attrs)
+            return
+        self.tag_stack.append(tag)
+
+    def handle_endtag(self, tag):
+        """Close ``tag``, tolerating stray or mismatched end tags."""
+        if tag in self._VOID_TAGS or tag not in self.tag_stack:
+            # Void tags are never on the stack, and an end tag without a
+            # matching start tag must not pop an unrelated element (or
+            # raise IndexError on an empty stack).
+            return
+        # Unwind to the matching start tag, implicitly closing anything
+        # that was left open inside it.
+        while self.tag_stack.pop() != tag:
+            pass
+
+    def handle_startendtag(self, tag, attrs):
+        """Render a self-closing / void element into ``output``."""
+        if tag == 'br':
+            self.output += '\n'
+        elif tag == 'hr':
+            self.output += '\n─────────────────────\n'
+        elif tag == 'img':
+            # TODO: image handling is not implemented yet.
+            pass
+
+
 def html_to_paragraphs(html):
    """Parse html properly.

@@ -166,7 +204,10 @@ def html_to_paragraphs(html):
    parsed = ''
    links = []
    images = []
+    parser = HTMLTextParser()
    for element in soup:
+        element_text = str(element)
+        print(parser.feed(element_text))
        text = parse_element(element, links, images)
        parsed += text