WIP: try python html parser for proper parsing

2024-08-16 18:36:02 +07:00 · 2024-08-16 18:36:02 +07:00 · 7ab79be936
commit 7ab79be936
parent 0ba5f2fc7a
1 changed files with 41 additions and 0 deletions
--- a/witchie/utils/init.py
+++ b/witchie/utils/init.py
@ -5,6 +5,8 @@ import subprocess
 import tempfile
 import unicodedata
 import warnings
 from collections import deque
 from html.parser import HTMLParser
 from typing import Dict
 from urllib.parse import quote, unquote, urlencode, urlparse
@ -149,6 +151,42 @@ def parse_element(element, links, images, depth=0, list_type=None):
    return texts
 class HTMLTextParser(HTMLParser):
    """Parse HTML to text.

    Work in progress: only ``<br>`` and ``<hr>`` currently contribute to
    the rendered text. Character data, links and images are not handled yet.

    Attributes:
        tag_stack: deque of the currently open (non-void) tag names,
            innermost tag last.
        links: list reserved for collected links (not populated yet).
        image_links: list reserved for collected images (not populated yet).
        output: text rendered so far.
    """

    # Void elements never have content or a closing tag, so they must not
    # be pushed on (or popped from) the open-tag stack.
    _VOID_TAGS = frozenset(('img', 'br', 'hr', 'wbr'))

    def __init__(self):
        # HTMLParser.__init__() calls reset(), which creates all of the
        # parser state; nothing else needs to be initialised here.
        super().__init__()

    def reset(self):
        """Reset the instance and clear all processed data."""
        super().reset()
        self.tag_stack = deque()
        self.output = ''
        self.links = []
        self.image_links = []

    def handle_starttag(self, tag, attrs):
        """Track an opening tag; void tags are rendered immediately.

        Args:
            tag: lower-cased tag name.
            attrs: list of ``(name, value)`` attribute pairs.
        """
        if tag in self._VOID_TAGS:
            # just in case they're not closed
            self.handle_startendtag(tag, attrs)
            return
        self.tag_stack.append(tag)

    def handle_endtag(self, tag):
        """Close ``tag``, tolerating malformed markup.

        Closing tags for void elements and closing tags that match no open
        element are ignored. If inner elements were left unclosed, they are
        discarded together with the matching element.

        Args:
            tag: lower-cased tag name.
        """
        if tag in self._VOID_TAGS or tag not in self.tag_stack:
            return
        # Unwind up to and including the matching opening tag.
        while self.tag_stack.pop() != tag:
            pass

    def handle_startendtag(self, tag, attrs):
        """Render a self-closing or void tag into ``output``.

        Args:
            tag: lower-cased tag name.
            attrs: list of ``(name, value)`` attribute pairs (unused so far).
        """
        if tag == 'br':
            self.output += '\n'
        elif tag == 'hr':
            self.output += '\n─────────────────────\n'
        elif tag == 'img':
            # TODO: images are not rendered or collected yet
            pass
 def html_to_paragraphs(html):
    """Parse html properly.
@ -166,7 +204,10 @@ def html_to_paragraphs(html):
    parsed = ''
    links = []
    images = []
    parser = HTMLTextParser()
    for element in soup:
        element_text = str(element)
        print(parser.feed(element_text))
        text = parse_element(element, links, images)
        parsed += text