WIP: try python html parser for proper parsing

This commit is contained in:
Huy Ngo 2024-08-16 18:36:02 +07:00
parent 0ba5f2fc7a
commit 7ab79be936

View file

@ -5,6 +5,8 @@ import subprocess
import tempfile import tempfile
import unicodedata import unicodedata
import warnings import warnings
from collections import deque
from html.parser import HTMLParser
from typing import Dict from typing import Dict
from urllib.parse import quote, unquote, urlencode, urlparse from urllib.parse import quote, unquote, urlencode, urlparse
@ -149,6 +151,42 @@ def parse_element(element, links, images, depth=0, list_type=None):
return texts return texts
class HTMLTextParser(HTMLParser):
    """Parse HTML to text.

    Tracks the currently open (non-void) tags in ``tag_stack`` and appends
    the textual rendering of ``<br>`` and ``<hr>`` to ``output``.  ``links``
    and ``image_links`` are initialised as empty lists; nothing in this class
    fills them yet, and character data is not collected because
    ``handle_data`` is not overridden.
    """

    # Void elements never get a matching end tag, so they must not be
    # pushed onto the open-tag stack.
    VOID_TAGS = frozenset(('img', 'br', 'hr', 'wbr'))

    def __init__(self):
        # HTMLParser.__init__ calls reset(), which already creates the
        # attributes below; they are repeated here so every instance
        # attribute is visibly declared in __init__.
        super().__init__()
        self.tag_stack = deque()
        self.links = []
        self.image_links = []
        self.output = ''

    def reset(self):
        """Reset the instance and clear all processed data."""
        super().reset()
        self.tag_stack = deque()
        self.output = ''
        self.links = []
        self.image_links = []

    def handle_starttag(self, tag, attrs):
        """Record an opening tag.

        Void tags (``img``, ``br``, ``hr``, ``wbr``) are forwarded to
        ``handle_startendtag`` because they may be written without the
        self-closing slash; every other tag is pushed onto ``tag_stack``.
        """
        if tag in self.VOID_TAGS:
            # just in case they're not closed
            self.handle_startendtag(tag, attrs)
            return
        self.tag_stack.append(tag)

    def handle_endtag(self, tag):
        """Close the most recently opened ``tag``.

        End tags for void elements and end tags with no matching open tag
        are ignored, so malformed markup cannot raise ``IndexError`` on an
        empty stack.  Tags opened after the match and never closed are
        discarded together with it.
        """
        if tag in self.VOID_TAGS or tag not in self.tag_stack:
            return
        # Unwind up to and including the nearest matching open tag.
        while self.tag_stack.pop() != tag:
            pass

    def handle_startendtag(self, tag, attrs):
        """Render a self-closing tag into ``output``.

        ``br`` becomes a newline and ``hr`` becomes a horizontal rule on its
        own line.  ``img`` is recognised but produces no output yet.
        """
        if tag == 'br':
            self.output += '\n'
        elif tag == 'hr':
            self.output += '\n─────────────────────\n'
        elif tag == 'img':
            # TODO: image handling is not implemented yet.
            pass
def html_to_paragraphs(html): def html_to_paragraphs(html):
"""Parse html properly. """Parse html properly.
@ -166,7 +204,10 @@ def html_to_paragraphs(html):
parsed = '' parsed = ''
links = [] links = []
images = [] images = []
parser = HTMLTextParser()
for element in soup: for element in soup:
element_text = str(element)
print(parser.feed(element_text))
text = parse_element(element, links, images) text = parse_element(element, links, images)
parsed += text parsed += text