WIP: try python html parser for proper parsing
This commit is contained in:
parent
0ba5f2fc7a
commit
7ab79be936
1 changed files with 41 additions and 0 deletions
|
@ -5,6 +5,8 @@ import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
import unicodedata
|
import unicodedata
|
||||||
import warnings
|
import warnings
|
||||||
|
from collections import deque
|
||||||
|
from html.parser import HTMLParser
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
from urllib.parse import quote, unquote, urlencode, urlparse
|
from urllib.parse import quote, unquote, urlencode, urlparse
|
||||||
|
|
||||||
|
@ -149,6 +151,42 @@ def parse_element(element, links, images, depth=0, list_type=None):
|
||||||
return texts
|
return texts
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLTextParser(HTMLParser):
    """Parse HTML to text.

    Work-in-progress parser that tracks the stack of currently open
    elements and appends a plain-text rendering of ``<br>`` and ``<hr>``
    to ``output``.

    Attributes:
        tag_stack: deque of the names of the elements that are currently
            open, outermost first.
        links: list reserved for collected links (not populated yet).
        image_links: list reserved for collected image links (not
            populated yet).
        output: the text produced so far.
    """

    # Elements this parser treats as having no closing tag; they are
    # rendered immediately and never pushed on ``tag_stack``.
    _VOID_TAGS = frozenset(('img', 'br', 'hr', 'wbr'))

    def __init__(self):
        # HTMLParser.__init__ calls self.reset(), which creates all of the
        # per-document state, so nothing needs to be repeated here.
        super().__init__()

    def reset(self):
        """Reset the instance and clear all processed data."""
        super().reset()
        self.tag_stack = deque()
        self.output = ''
        self.links = []
        self.image_links = []

    def handle_starttag(self, tag, attrs):
        """Record an opening tag.

        Void elements are forwarded to ``handle_startendtag`` (they may
        have been written without the trailing ``/``); every other tag
        is pushed on ``tag_stack``.
        """
        if tag in self._VOID_TAGS:
            # just in case they're not closed
            self.handle_startendtag(tag, attrs)
            return
        self.tag_stack.append(tag)

    def handle_endtag(self, tag):
        """Close the innermost open element named ``tag``.

        End tags for void elements and end tags that match no open
        element are ignored. Elements still open inside the one being
        closed (e.g. a ``<p>`` that was never closed explicitly) are
        discarded together with it.
        """
        if tag in self._VOID_TAGS or tag not in self.tag_stack:
            return
        # The membership test above guarantees this loop terminates.
        while self.tag_stack.pop() != tag:
            pass

    def handle_startendtag(self, tag, attrs):
        """Render a self-contained tag into ``output``.

        ``br`` becomes a newline and ``hr`` a horizontal rule on its own
        line. Other tags, including ``img``, currently produce no output.
        """
        if tag == 'br':
            self.output += '\n'
        elif tag == 'hr':
            self.output += '\n─────────────────────\n'
        elif tag == 'img':
            # TODO: images are not rendered yet.
            pass
|
||||||
|
|
||||||
|
|
||||||
def html_to_paragraphs(html):
|
def html_to_paragraphs(html):
|
||||||
"""Parse html properly.
|
"""Parse html properly.
|
||||||
|
|
||||||
|
@ -166,7 +204,10 @@ def html_to_paragraphs(html):
|
||||||
parsed = ''
|
parsed = ''
|
||||||
links = []
|
links = []
|
||||||
images = []
|
images = []
|
||||||
|
parser = HTMLTextParser()
|
||||||
for element in soup:
|
for element in soup:
|
||||||
|
element_text = str(element)
|
||||||
|
print(parser.feed(element_text))
|
||||||
text = parse_element(element, links, images)
|
text = parse_element(element, links, images)
|
||||||
parsed += text
|
parsed += text
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue