WIP: try python html parser for proper parsing
This commit is contained in:
parent
0ba5f2fc7a
commit
7ab79be936
1 changed file with 41 additions and 0 deletions
|
@@ -5,6 +5,8 @@ import subprocess
|
|||
import tempfile
|
||||
import unicodedata
|
||||
import warnings
|
||||
from collections import deque
|
||||
from html.parser import HTMLParser
|
||||
from typing import Dict
|
||||
from urllib.parse import quote, unquote, urlencode, urlparse
|
||||
|
||||
|
@@ -149,6 +151,42 @@ def parse_element(element, links, images, depth=0, list_type=None):
|
|||
return texts
|
||||
|
||||
|
||||
class HTMLTextParser(HTMLParser):
    """Parse HTML to text.

    Work in progress: at the moment only ``<br>`` and ``<hr>`` contribute
    to ``output``; element text is not collected, and ``links`` /
    ``image_links`` are initialised but not filled in yet.

    Attributes:
        tag_stack: deque of the currently open (non-void) tag names,
            innermost tag last.
        links: list reserved for links found in the document.
        image_links: list reserved for image links found in the document.
        output: text produced so far.
    """

    # Elements that never have a closing tag; they must not be pushed on
    # the tag stack even when written without the trailing "/>".
    VOID_TAGS = frozenset(('img', 'br', 'hr', 'wbr'))

    def __init__(self):
        # HTMLParser.__init__() calls self.reset(), which sets up all of
        # the parser state below, so nothing else is needed here.
        super().__init__()

    def reset(self):
        """Reset the instance and clear all processed data."""
        super().reset()
        self.tag_stack = deque()
        self.output = ''
        self.links = []
        self.image_links = []

    def handle_starttag(self, tag, attrs):
        """Record an opening tag, or forward void tags as self-closing."""
        if tag in self.VOID_TAGS:
            # just in case they're not closed
            self.handle_startendtag(tag, attrs)
            return
        self.tag_stack.append(tag)

    def handle_endtag(self, tag):
        """Close ``tag`` together with any unclosed tags nested inside it.

        Stray end tags (no matching open tag) and end tags of void
        elements are ignored instead of corrupting or underflowing the
        stack.
        """
        if tag in self.VOID_TAGS or tag not in self.tag_stack:
            return
        # Unwind up to and including the matching open tag, so that
        # implicitly closed elements (e.g. "<li>a<li>b</ul>") are dropped.
        while self.tag_stack.pop() != tag:
            pass

    def handle_startendtag(self, tag, attrs):
        """Emit the text representation of a void / self-closing tag."""
        if tag == 'br':
            self.output += '\n'
        elif tag == 'hr':
            self.output += '\n─────────────────────\n'
        elif tag == 'img':
            # TODO: images are not handled yet.
            pass
|
||||
|
||||
|
||||
def html_to_paragraphs(html):
|
||||
"""Parse html properly.
|
||||
|
||||
|
@@ -166,7 +204,10 @@ def html_to_paragraphs(html):
|
|||
parsed = ''
|
||||
links = []
|
||||
images = []
|
||||
parser = HTMLTextParser()
|
||||
for element in soup:
|
||||
element_text = str(element)
|
||||
print(parser.feed(element_text))
|
||||
text = parse_element(element, links, images)
|
||||
parsed += text
|
||||
|
||||
|
|
Loading…
Reference in a new issue