WIP: try python html parser for proper parsing

This commit is contained in:
Huy Ngo 2024-08-16 18:36:02 +07:00
parent 0ba5f2fc7a
commit 7ab79be936

View file

@ -5,6 +5,8 @@ import subprocess
import tempfile
import unicodedata
import warnings
from collections import deque
from html.parser import HTMLParser
from typing import Dict
from urllib.parse import quote, unquote, urlencode, urlparse
@ -149,6 +151,42 @@ def parse_element(element, links, images, depth=0, list_type=None):
return texts
class HTMLTextParser(HTMLParser):
    """Parse HTML to text.

    Feed markup through ``feed()``; the text rendering accumulates in
    ``output``.  Only line breaks (``<br>``) and horizontal rules
    (``<hr>``) produce output so far; ``<img>`` is recognised but ignored.

    Attributes:
        tag_stack: Names of the currently open non-void tags, innermost
            last.
        links: Initialised empty; nothing in this class fills it yet.
        image_links: Initialised empty; nothing in this class fills it yet.
        output: Text generated from the markup fed so far.
    """

    # Tags handled as self-closing even when written without a trailing
    # slash (e.g. ``<br>``), so they are never pushed on the tag stack.
    _VOID_TAGS = frozenset(('img', 'br', 'hr', 'wbr'))

    # Text emitted for a horizontal rule.
    _HORIZONTAL_RULE = '\n─────────────────────\n'

    def __init__(self):
        super().__init__()
        self._clear_state()

    def _clear_state(self):
        """Set every result attribute back to its empty value."""
        self.tag_stack = deque()
        self.links = []
        self.image_links = []
        self.output = ''

    def reset(self):
        """Reset the instance and clear all processed data."""
        super().reset()
        self._clear_state()

    def handle_starttag(self, tag, attrs):
        """Record an opening tag, treating void tags as self-closing."""
        if tag in self._VOID_TAGS:
            # Just in case they're not closed (``<br>`` instead of ``<br/>``).
            self.handle_startendtag(tag, attrs)
            return
        self.tag_stack.append(tag)

    def handle_endtag(self, tag):
        """Close ``tag``, tolerating stray and mis-nested end tags."""
        # Void tags are never pushed, and a closing tag that was never
        # opened has nothing to pop; popping here would drop an unrelated
        # entry or raise IndexError on an empty stack.
        if tag in self._VOID_TAGS or tag not in self.tag_stack:
            return
        # Unwind to the matching opener so that tags left unclosed inside
        # it (``<p><b>text</p>``) do not linger on the stack.
        while self.tag_stack.pop() != tag:
            pass

    def handle_startendtag(self, tag, attrs):
        """Emit the text representation of a self-closing tag."""
        if tag == 'br':
            self.output += '\n'
        elif tag == 'hr':
            self.output += self._HORIZONTAL_RULE
        # 'img' and every other self-closing tag currently produce no output.
def html_to_paragraphs(html):
"""Parse html properly.
@ -166,7 +204,10 @@ def html_to_paragraphs(html):
parsed = ''
links = []
images = []
parser = HTMLTextParser()
for element in soup:
element_text = str(element)
print(parser.feed(element_text))
text = parse_element(element, links, images)
parsed += text