Implement nested list

Huy Ngo 2024-08-15 15:40:26 +07:00
parent 4adaf21861
commit e72be437e6


@@ -41,7 +41,7 @@ def get_text(element, links, images):
         text = element.text
         if (element['href'] != element.text
                 and 'mention' not in element.get('class', [])
-                and element.get('rel') != 'tag'):
+                and 'tag' not in element.get('rel')):
             links.append(element['href'])
             text = f'{text}[{len(links)}]'
         return f'<cyan>{text}</cyan>'
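
For context, BeautifulSoup treats rel as a multi-valued attribute, so element.get('rel') comes back as a list such as ['tag'] rather than the bare string, and the old equality test never matched hashtag links. A quick standalone sketch (the URL and the variable name are made up):

from bs4 import BeautifulSoup

# rel is multi-valued in BeautifulSoup, so .get('rel') is a list, never 'tag'.
a = BeautifulSoup('<a href="https://example.com/tags/x" rel="tag">#x</a>',
                  'html.parser').a
print(a.get('rel'))               # ['tag']
print(a.get('rel') != 'tag')      # True  -- the old check let hashtags through
print('tag' not in a.get('rel'))  # False -- the new check skips them

Note that the new membership test assumes rel is present: element.get('rel') is None for anchors without the attribute.
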
@@ -50,52 +50,47 @@ def get_text(element, links, images):
         images.append(element['src'])
         text = f'{text}[{len(images)}]'
         return f'<cyan>{text}</cyan>'
     for a in element.find_all('a'):
         soup = BeautifulSoup("", "html.parser")
         if a['href'] != a.text and 'mention' not in a.get('class', []) and a.get('rel') != 'tag':
             links.append(a.href)
         new_tag = soup.new_tag('a')
         new_tag.string = '&lt;cyan>' + a.text + '&lt;/cyan>'
         a.replace_with(new_tag)
     text = element.get_text()
     text = text.replace('<', '\\<')
     text = text.replace('&lt;', '<')
     return unicodedata.normalize("NFKC", text)
-def parse_element(element, links, images):
+def parse_element(element, links, images, depth=0, list_type=None):
     """Parse top level element."""
-    texts = []
+    texts = ''
     if element.name == 'ul':
-        texts.append('\n')
-        for li in element.find_all('li'):
-            text = get_text(li, links, images)
-            text = '- ' + text + '\n'
-            texts.append(text)
-        texts.append('\n')
+        for li in element.children:
+            text = parse_element(li, links, images, depth, '- ')
+            texts += text
     elif element.name == 'ol':
-        texts.append('\n')
-        for i, li in enumerate(element.find_all('li')):
-            text = get_text(li, links, images)
-            text = str(i + 1) + '. ' + text + '\n'
-            texts.append(text + '\n')
-        texts.append('\n')
+        for i, li in enumerate(element.children):
+            text = parse_element(li, links, images, depth, str(i + 1) + '. ')
+            texts += text
+    elif element.name == 'li':
+        text = '<dim>' + '··' * depth + '</dim>' + (list_type or '')
+        for child in element.children:
+            if child.name in ('ul', 'ol'):
+                text += parse_element(child, links, images, depth + 1)
+            else:
+                text += parse_element(child, links, images, depth) + '\n'
+        texts += text
     elif element.name == 'blockquote':
-        texts.append('<yellow>quote:</yellow>\n')
+        texts += '<yellow>quote:</yellow>\n'
         for child in element.children:
             text = parse_element(child, links, images)
-            texts.extend(text)
-        texts.append('<yellow>endquote</yellow>\n')
+            texts += text
+        texts += '<yellow>endquote</yellow>\n'
     elif element.name == 'p':
         for child in element.children:
             text = parse_element(child, links, images)
-            texts.extend(text)
-        texts.append('\n')
+            texts += text
+        texts += '\n'
     elif element.name == 'br':
-        texts.append('\n')
+        texts += '\n'
     else:
         text = get_text(element, links, images)
-        texts.append(text)
+        texts += text
     return texts
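
The new li branch is what makes arbitrary nesting work: each li emits its own '··'-per-level indent (wrapped in <dim> markup) plus the '- ' or 'n. ' prefix handed down by the enclosing ul/ol, and a list nested inside an li recurses with depth + 1. A standalone sketch of that recursion, with illustrative names, no link/image bookkeeping, and the <dim>/<cyan> markup dropped:

from bs4 import BeautifulSoup

def render(element, depth=0, list_type=None):
    """Toy version of the nested-list recursion: ul/ol/li only."""
    name = getattr(element, 'name', None)  # text nodes have no tag name
    if name == 'ul':
        return ''.join(render(li, depth, '- ') for li in element.children)
    if name == 'ol':
        return ''.join(render(li, depth, str(i + 1) + '. ')
                       for i, li in enumerate(element.children))
    if name == 'li':
        line = '··' * depth + (list_type or '')
        for child in element.children:
            if getattr(child, 'name', None) in ('ul', 'ol'):
                line += render(child, depth + 1)  # nested list: one level deeper
            else:
                line += render(child, depth) + '\n'
        return line
    # Leaf fallback; the real code calls its own get_text() helper here.
    return element.get_text() if name else str(element)

# No whitespace between tags, since children is iterated directly (as in the commit).
html = '<ul><li>fruits<ul><li>apple</li><li>banana</li></ul></li><li>nuts</li></ul>'
print(render(BeautifulSoup(html, 'html.parser').ul))
# - fruits
# ··- apple
# ··- banana
# - nuts
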
@@ -112,32 +107,17 @@ def html_to_paragraphs(html):
     - ul lists are presented with hyphen-bullet
     - ol lists are presented with numbers
     """
-    # paragraphs = re.split("</?p[^>]*>", html)
-    # # Convert <br>s to line breaks and remove empty paragraphs
-    # paragraphs = [re.split("<br */?>", p) for p in paragraphs if p]
-    # # Convert each line in each paragraph to plain text:
-    # return [[get_text(line) for line in p] for p in paragraphs]
     soup = parse_html(html)
-    parsed = []
+    parsed = ''
     links = []
     images = []
     for element in soup:
         text = parse_element(element, links, images)
-        parsed.extend(text)
+        parsed += text
-    paragraphs = []
-    for text in parsed:
-        if len(paragraphs) == 0:
-            paragraphs.append(text)
-            continue
-        if paragraphs[-1].endswith('\n'):
-            paragraphs.append(text)
-        else:
-            paragraphs[-1] += text
-    if links or images:
-        paragraphs.append('')
+    if parsed[-1] == '\n':
+        parsed = parsed[:-1]
+    paragraphs = parsed.split('\n')
     for i, link in enumerate(links):
         paragraphs.append(f'[{i + 1}]: {link}')
     for i, link in enumerate(images):