Implement nested list
This commit is contained in:
parent
4adaf21861
commit
e72be437e6
1 changed files with 29 additions and 49 deletions
|
@ -41,7 +41,7 @@ def get_text(element, links, images):
|
|||
text = element.text
|
||||
if (element['href'] != element.text
|
||||
and 'mention' not in element.get('class', [])
|
||||
and element.get('rel') != 'tag'):
|
||||
and 'tag' not in element.get('rel')):
|
||||
links.append(element['href'])
|
||||
text = f'{text}[{len(links)}]'
|
||||
return f'<cyan>{text}</cyan>'
|
||||
|
@ -50,52 +50,47 @@ def get_text(element, links, images):
|
|||
images.append(element['src'])
|
||||
text = f'{text}[{len(images)}]'
|
||||
return f'<cyan>{text}</cyan>'
|
||||
for a in element.find_all('a'):
|
||||
soup = BeautifulSoup("", "html.parser")
|
||||
if a['href'] != a.text and 'mention' not in a.get('class', []) and a.get('rel') != 'tag':
|
||||
links.append(a.href)
|
||||
new_tag = soup.new_tag('a')
|
||||
new_tag.string = '<cyan>' + a.text + '</cyan>'
|
||||
a.replace_with(new_tag)
|
||||
text = element.get_text()
|
||||
text = text.replace('<', '\\<')
|
||||
text = text.replace('<', '<')
|
||||
return unicodedata.normalize("NFKC", text)
|
||||
|
||||
|
||||
def parse_element(element, links, images):
|
||||
def parse_element(element, links, images, depth=0, list_type=None):
|
||||
"""Parse top level element."""
|
||||
texts = []
|
||||
texts = ''
|
||||
if element.name == 'ul':
|
||||
texts.append('\n')
|
||||
for li in element.find_all('li'):
|
||||
text = get_text(li, links, images)
|
||||
text = '- ' + text + '\n'
|
||||
texts.append(text)
|
||||
texts.append('\n')
|
||||
for li in element.children:
|
||||
text = parse_element(li, links, images, depth, '- ')
|
||||
texts += text
|
||||
elif element.name == 'ol':
|
||||
texts.append('\n')
|
||||
for i, li in enumerate(element.find_all('li')):
|
||||
text = get_text(li, links, images)
|
||||
text = str(i + 1) + '. ' + text + '\n'
|
||||
texts.append(text + '\n')
|
||||
texts.append('\n')
|
||||
for i, li in enumerate(element.children):
|
||||
text = parse_element(li, links, images, depth, str(i + 1) + '. ')
|
||||
texts += text
|
||||
elif element.name == 'li':
|
||||
text = '<dim>' + '··' * depth + '</dim>' + (list_type or '')
|
||||
for child in element.children:
|
||||
if child.name in ('ul', 'ol'):
|
||||
text += parse_element(child, links, images, depth + 1)
|
||||
else:
|
||||
text += parse_element(child, links, images, depth) + '\n'
|
||||
texts += text
|
||||
elif element.name == 'blockquote':
|
||||
texts.append('<yellow>quote:</yellow>\n')
|
||||
texts += '<yellow>quote:</yellow>\n'
|
||||
for child in element.children:
|
||||
text = parse_element(child, links, images)
|
||||
texts.extend(text)
|
||||
texts.append('<yellow>endquote</yellow>\n')
|
||||
texts += text
|
||||
texts += '<yellow>endquote</yellow>\n'
|
||||
elif element.name == 'p':
|
||||
for child in element.children:
|
||||
text = parse_element(child, links, images)
|
||||
texts.extend(text)
|
||||
texts.append('\n')
|
||||
texts += text
|
||||
texts += '\n'
|
||||
elif element.name == 'br':
|
||||
texts.append('\n')
|
||||
texts += '\n'
|
||||
else:
|
||||
text = get_text(element, links, images)
|
||||
texts.append(text)
|
||||
texts += text
|
||||
return texts
|
||||
|
||||
|
||||
|
@ -112,32 +107,17 @@ def html_to_paragraphs(html):
|
|||
- ul lists are presented with hyphen-bullet
|
||||
- ol lists are presented with numbers
|
||||
"""
|
||||
# paragraphs = re.split("</?p[^>]*>", html)
|
||||
|
||||
# # Convert <br>s to line breaks and remove empty paragraphs
|
||||
# paragraphs = [re.split("<br */?>", p) for p in paragraphs if p]
|
||||
|
||||
# # Convert each line in each paragraph to plain text:
|
||||
# return [[get_text(line) for line in p] for p in paragraphs]
|
||||
soup = parse_html(html)
|
||||
parsed = []
|
||||
parsed = ''
|
||||
links = []
|
||||
images = []
|
||||
for element in soup:
|
||||
text = parse_element(element, links, images)
|
||||
parsed.extend(text)
|
||||
parsed += text
|
||||
|
||||
paragraphs = []
|
||||
for text in parsed:
|
||||
if len(paragraphs) == 0:
|
||||
paragraphs.append(text)
|
||||
continue
|
||||
if paragraphs[-1].endswith('\n'):
|
||||
paragraphs.append(text)
|
||||
else:
|
||||
paragraphs[-1] += text
|
||||
if links or images:
|
||||
paragraphs.append('')
|
||||
if parsed[-1] == '\n':
|
||||
parsed = parsed[:-1]
|
||||
paragraphs = parsed.split('\n')
|
||||
for i, link in enumerate(links):
|
||||
paragraphs.append(f'[{i + 1}]: {link}')
|
||||
for i, link in enumerate(images):
|
||||
|
|
Loading…
Reference in a new issue