Update some markup

This commit is contained in:
Huy Ngo 2024-08-15 16:01:49 +07:00
parent e72be437e6
commit 28dd345219

View file

@ -5,6 +5,7 @@ import subprocess
import tempfile
import unicodedata
import warnings
from itertools import chain
from typing import Dict
from urllib.parse import quote, unquote, urlencode, urlparse
@ -41,7 +42,7 @@ def get_text(element, links, images):
text = element.text
if (element['href'] != element.text
and 'mention' not in element.get('class', [])
and 'tag' not in element.get('rel')):
and 'tag' not in element.get('rel', [])):
links.append(element['href'])
text = f'{text}[{len(links)}]'
return f'<cyan>{text}</cyan>'
@ -50,6 +51,51 @@ def get_text(element, links, images):
images.append(element['src'])
text = f'{text}[{len(images)}]'
return f'<cyan>{text}</cyan>'
if element.name in ('i', 'em'):
return f'<italic>{element.text}</italic>'
if element.name in ('b', 'strong'):
return f'<bold>{element.text}</bold>'
if element.name in ('s', 'del'):
return f'<strikethrough>{element.text}</strikethrough>'
if element.name == 'u':
return f'<underline>{element.text}</underline>'
for a in element.find_all('a'):
soup = BeautifulSoup("", "html.parser")
if (element['href'] != element.text
and 'mention' not in element.get('class', [])
and 'tag' not in element.get('rel', [])):
links.append(a.href)
new_tag = soup.new_tag('a')
new_tag.string = '&lt;cyan>' + a.text + '&lt;/cyan>'
a.replace_with(new_tag)
for img in element.find_all('img'):
soup = BeautifulSoup("", "html.parser")
text = element.get('alt', 'image')
images.append(element['src'])
text = f'{text}[{len(images)}]'
new_tag = soup.new_tag('span')
new_tag.string = '&lt;cyan>' + text + '&lt;/cyan>'
img.replace_with(new_tag)
for italic in chain(element.find_all('i'), element.find_all('em')):
soup = BeautifulSoup("", "html.parser")
new_tag = soup.new_tag('span')
new_tag.string = '&lt;italic>' + a.text + '&lt;/italic>'
italic.replace_with(new_tag)
for bold in chain(element.find_all('b'), element.find_all('strong')):
soup = BeautifulSoup("", "html.parser")
new_tag = soup.new_tag('span')
new_tag.string = '&lt;bold>' + a.text + '&lt;/bold>'
bold.replace_with(new_tag)
for underline in element.find_all('u'):
soup = BeautifulSoup("", "html.parser")
new_tag = soup.new_tag('span')
new_tag.string = '&lt;underline>' + a.text + '&lt;/underline>'
underline.replace_with(new_tag)
for strike in chain(element.find_all('s'), element.find_all('del')):
soup = BeautifulSoup("", "html.parser")
new_tag = soup.new_tag('span')
new_tag.string = '&lt;strikethrough>' + a.text + '&lt;/strikethrough>'
strike.replace_with(new_tag)
text = element.get_text()
text = text.replace('<', '\\<')
text = text.replace('&lt;', '<')