Merge pull request #415 from ihabunek/danschwarz-richtext3

Add support for rich text
This commit is contained in:
Ivan Habunek 2023-11-18 15:40:35 +01:00 committed by GitHub
commit 317840b019
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
15 changed files with 605 additions and 57 deletions

View file

@ -18,7 +18,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -e .
pip install -e .\[richtext\]
pip install -r requirements-test.txt
- name: Run tests
run: |

View file

@ -2,4 +2,4 @@ requests>=2.13,<3.0
beautifulsoup4>=4.5.0,<5.0
wcwidth>=0.1.7
urwid>=2.0.0,<3.0
urwidgets>=0.1,<0.2

View file

@ -31,7 +31,7 @@ setup(
'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
'Programming Language :: Python :: 3',
],
packages=['toot', 'toot.tui', 'toot.utils'],
packages=['toot', 'toot.tui', 'toot.tui.richtext', 'toot.utils'],
python_requires=">=3.7",
install_requires=[
"requests>=2.13,<3.0",
@ -40,6 +40,9 @@ setup(
"urwid>=2.0.0,<3.0",
"tomlkit>=0.10.0,<1.0"
],
extras_require={
"richtext": ['urwidgets>=0.1,<0.2'],
},
entry_points={
'console_scripts': [
'toot=toot.console:main',

View file

@ -3,6 +3,7 @@ import pytest
from toot.console import duration
from toot.wcstring import wc_wrap, trunc, pad, fit_text
from toot.utils import urlencode_url
def test_pad():
@ -201,3 +202,8 @@ def test_duration():
with pytest.raises(ArgumentTypeError):
duration("banana")
def test_urlencode_url():
    """URLs that need no (further) encoding must come back unchanged."""
    unchanged_urls = (
        "https://www.example.com",
        "https://www.example.com/url%20with%20spaces",
    )
    for candidate in unchanged_urls:
        assert urlencode_url(candidate) == candidate

View file

@ -0,0 +1,45 @@
from urwid import Divider, Filler, Pile
from toot.tui.richtext import url_to_widget
from urwidgets import Hyperlink, TextEmbed
from toot.tui.richtext.richtext import html_to_widgets
def test_url_to_widget():
    """url_to_widget yields a TextEmbed holding one Hyperlink for the URL."""
    target = "http://foo.bar"
    embed = url_to_widget(target)
    assert isinstance(embed, TextEmbed)

    # exactly one embedded widget, spanning the full length of the URL
    (wrapper, span), = embed.embedded
    assert span == len(target)
    assert isinstance(wrapper, Filler)

    hyperlink: Hyperlink = wrapper.base_widget
    assert isinstance(hyperlink, Hyperlink)
    assert hyperlink.attrib == "link"
    assert hyperlink.text == target
    assert hyperlink.uri == target
def test_html_to_widgets():
    """Two paragraphs render as Pile, Divider, Pile with inline tag attributes."""
    html = """
<p>foo</p>
<p>foo <b>bar</b> <i>baz</i></p>
""".strip()

    # top-level result: one widget per paragraph with a divider in between
    [foo, divider, bar] = html_to_widgets(html)

    assert isinstance(foo, Pile)
    assert isinstance(divider, Divider)
    assert isinstance(bar, Pile)

    # first paragraph: plain text, no attributes and nothing embedded
    [foo_embed] = foo.widget_list
    assert foo_embed.embedded == []
    assert foo_embed.attrib == []
    assert foo_embed.text == "foo"

    # second paragraph: attribute runs are (attribute name, run length) pairs
    [bar_embed] = bar.widget_list
    assert bar_embed.embedded == []
    assert bar_embed.attrib == [(None, 4), ("b", 3), (None, 1), ("i", 3)]
    assert bar_embed.text == "foo bar baz"

View file

@ -6,7 +6,7 @@ import textwrap
from functools import lru_cache
from toot import settings
from toot.entities import Instance, Notification, Poll, Status
from toot.utils import get_text, parse_html
from toot.utils import get_text, html_to_paragraphs
from toot.wcstring import wc_wrap
from typing import List
from wcwidth import wcswidth
@ -321,7 +321,7 @@ def print_status(status: Status, width: int = 80):
def print_html(text, width=80):
first = True
for paragraph in parse_html(text):
for paragraph in html_to_paragraphs(text):
if not first:
print_out("")
for line in paragraph:

View file

@ -143,7 +143,6 @@ class TUI(urwid.Frame):
def run(self):
self.loop.set_alarm_in(0, lambda *args: self.async_load_instance())
self.loop.set_alarm_in(0, lambda *args: self.async_load_followed_accounts())
self.loop.set_alarm_in(0, lambda *args: self.async_load_followed_tags())
self.loop.set_alarm_in(0, lambda *args: self.async_load_timeline(
is_initial=True, timeline_name="home"))
self.loop.run()
@ -339,22 +338,6 @@ class TUI(urwid.Frame):
self.run_in_thread(_load_accounts, done_callback=_done_accounts)
def async_load_followed_tags(self):
def _load_tag_list():
try:
return api.followed_tags(self.app, self.user)
except ApiError:
# not supported by all Mastodon servers so fail silently if necessary
return []
def _done_tag_list(tags):
if len(tags) > 0:
self.followed_tags = [t["name"] for t in tags]
else:
self.followed_tags = []
self.run_in_thread(_load_tag_list, done_callback=_done_tag_list)
def refresh_footer(self, timeline):
"""Show status details in footer."""
status, index, count = timeline.get_focused_status_with_counts()

View file

@ -57,6 +57,29 @@ PALETTE = [
('dim', 'dark gray', ''),
('highlight', 'yellow', ''),
('success', 'dark green', ''),
# HTML tag styling
('a', ',italics', '', 'italics'),
# em tag is mapped to i
('i', ',italics', '', 'italics'),
# strong tag is mapped to b
('b', ',bold', '', 'bold'),
# special case for bold + italic nested tags
('bi', ',bold,italics', '', ',bold,italics'),
('u', ',underline', '', ',underline'),
('del', ',strikethrough', '', ',strikethrough'),
('code', 'light gray, standout', '', ',standout'),
('pre', 'light gray, standout', '', ',standout'),
('blockquote', 'light gray', '', ''),
('h1', ',bold', '', ',bold'),
('h2', ',bold', '', ',bold'),
('h3', ',bold', '', ',bold'),
('h4', ',bold', '', ',bold'),
('h5', ',bold', '', ',bold'),
('h6', ',bold', '', ',bold'),
('class_mention_hashtag', 'light cyan', '', ''),
('class_hashtag', 'light cyan', '', ''),
]
VISIBILITY_OPTIONS = [

View file

@ -4,10 +4,10 @@ import urwid
import webbrowser
from toot import __version__
from toot.utils import format_content
from .utils import highlight_hashtags, highlight_keys
from .widgets import Button, EditBox, SelectableText
from toot import api
from toot.tui.utils import highlight_keys
from toot.tui.widgets import Button, EditBox, SelectableText
from toot.tui.richtext import html_to_widgets
class StatusSource(urwid.Padding):
@ -279,8 +279,10 @@ class Account(urwid.ListBox):
if account["note"]:
yield urwid.Divider()
for line in format_content(account["note"]):
yield urwid.Text(highlight_hashtags(line, followed_tags=set()))
widgetlist = html_to_widgets(account["note"])
for line in widgetlist:
yield (line)
yield urwid.Divider()
yield urwid.Text(["ID: ", ("highlight", f"{account['id']}")])
@ -312,8 +314,11 @@ class Account(urwid.ListBox):
name = field["name"].title()
yield urwid.Divider()
yield urwid.Text([("bold", f"{name.rstrip(':')}"), ":"])
for line in format_content(field["value"]):
yield urwid.Text(highlight_hashtags(line, followed_tags=set()))
widgetlist = html_to_widgets(field["value"])
for line in widgetlist:
yield (line)
if field["verified_at"]:
yield urwid.Text(("success", "✓ Verified"))

View file

@ -2,11 +2,9 @@ import urwid
from toot import api
from toot.exceptions import ApiError
from toot.utils import format_content
from toot.utils.datetime import parse_datetime
from .utils import highlight_hashtags
from .widgets import Button, CheckBox, RadioButton
from .richtext import html_to_widgets
class Poll(urwid.ListBox):
@ -87,8 +85,11 @@ class Poll(urwid.ListBox):
def generate_contents(self, status):
yield urwid.Divider()
for line in format_content(status.data["content"]):
yield urwid.Text(highlight_hashtags(line, set()))
widgetlist = html_to_widgets(status.data["content"])
for line in widgetlist:
yield (line)
yield urwid.Divider()
yield self.build_linebox(self.generate_poll_detail())

View file

@ -0,0 +1,18 @@
import urwid
from toot.tui.utils import highlight_hashtags
from toot.utils import format_content
from typing import List
try:
    from .richtext import html_to_widgets, url_to_widget
except ImportError:
    # urwidgets (the optional "richtext" extra) could not be imported:
    # degrade to plain urwid.Text widgets.
    def html_to_widgets(html: str) -> List[urwid.Widget]:
        """Render html as plain text lines with hashtags highlighted."""
        plain_widgets = []
        for text_line in format_content(html):
            plain_widgets.append(urwid.Text(highlight_hashtags(text_line)))
        return plain_widgets

    def url_to_widget(url: str):
        """Render url as plain text using the "link" display attribute."""
        link_markup = ("link", url)
        return urwid.Text(link_markup)

View file

@ -0,0 +1,452 @@
import re
import urwid
import unicodedata
from bs4.element import NavigableString, Tag
from toot.tui.constants import PALETTE
from toot.utils import parse_html, urlencode_url
from typing import List, Tuple
from urwid.util import decompose_tagmarkup
from urwidgets import Hyperlink, TextEmbed
STYLE_NAMES = [p[0] for p in PALETTE]
# NOTE: update this list if Mastodon starts supporting more block tags
BLOCK_TAGS = ["p", "pre", "li", "blockquote", "h1", "h2", "h3", "h4", "h5", "h6"]
def html_to_widgets(html, recovery_attempt=False) -> List[urwid.Widget]:
    """Convert html to a list of urwid widgets.

    Each top-level element becomes one widget; consecutive widgets are
    separated by a blank Divider (no Divider after the last one).

    html: the HTML source to render.
    recovery_attempt: True when this call is the retry made after wrapping
        out-of-spec HTML in <p></p>; prevents retrying more than once.
    """
    widgets: List[urwid.Widget] = []
    html = unicodedata.normalize("NFKC", html)
    soup = parse_html(html)
    first_tag = True
    for e in soup.body or soup:
        if isinstance(e, NavigableString):
            if first_tag and not recovery_attempt:
                # if our first "tag" is a navigable string
                # the HTML is out of spec, doesn't start with a tag,
                # we see this in content from Pixelfed servers.
                # attempt a fix by wrapping the HTML with <p></p>
                return html_to_widgets(f"<p>{html}</p>", recovery_attempt=True)
            # stray text between top-level tags is ignored
            continue

        name = e.name
        # if our HTML starts with a tag, but not a block tag
        # the HTML is out of spec. Attempt a fix by wrapping the
        # HTML with <p></p>
        if first_tag and not recovery_attempt and name not in BLOCK_TAGS:
            return html_to_widgets(f"<p>{html}</p>", recovery_attempt=True)

        markup = render(name, e)
        first_tag = False

        if not isinstance(markup, urwid.Widget):
            # render() returned an (attribute, content) markup tuple.
            # text_to_widget iterates over a list of runs, so the tuple
            # must be wrapped in a list; iterating the tuple itself would
            # emit the attribute name (e.g. "br") as visible text.
            txt = text_to_widget("", [markup])
            markup = urwid.Padding(
                txt,
                align="left",
                width=("relative", 100),
                min_width=None,
            )
        widgets.append(markup)
        # separate top level widgets with a blank line
        widgets.append(urwid.Divider(" "))
    return widgets[:-1]  # but suppress the last blank line
def url_to_widget(url: str):
    """Build a TextEmbed containing a single Hyperlink whose text and target are url."""
    hyperlink = Hyperlink(url, "link", url)
    embedded = (len(url), urwid.Filler(hyperlink))
    return TextEmbed(embedded)
def inline_tag_to_text(tag) -> Tuple:
    """Turn an inline tag into (tag name, child markups).

    The children are processed recursively; a tag without children
    yields an empty string as its content.
    """
    children = process_inline_tag_children(tag)
    return (tag.name, children if children else "")
def process_inline_tag_children(tag) -> List:
    """Return the markup for every child of tag, in document order.

    Child tags are passed through render() (which may recurse);
    any other child is kept as is.
    """
    return [
        render(node.name, node) if isinstance(node, Tag) else node
        for node in tag.children
    ]
URL_PATTERN = re.compile(r"(^.+)\x03(.+$)")
def text_to_widget(attr, markup) -> urwid.Widget:
    """Build a TextEmbed from a list of markup runs.

    Tuple runs whose text matches URL_PATTERN (label, ETX separator, href)
    are replaced by an embedded Hyperlink widget; every other run is passed
    through unchanged. The attr argument is not used by this function.
    """
    runs = []
    for piece in markup:
        link = None
        if isinstance(piece, tuple):
            text, attrs = decompose_tagmarkup(piece)
            # find anchor titles with an ETX separator followed by href
            link = URL_PATTERN.match(text)

        if link is None:
            runs.append(piece)
            continue

        label, url = link.groups()
        hyperlink = Hyperlink(url, get_best_anchor_attr(attrs), label)
        runs.append((len(label), urwid.Filler(hyperlink)))

    return TextEmbed(runs)
def process_block_tag_children(tag) -> List[urwid.Widget]:
    """Convert the children of a block tag into a list of widgets.

    Children that render to widgets are kept as they are. Text and inline
    markup is collected into at most two text widgets: one for everything
    seen before the first nested widget and one for everything seen after it.
    """
    leading_markups = []
    trailing_markups = []
    nested_widgets = []

    for node in tag.children:
        # nested tags go through render(); plain text is used directly
        rendered = render(node.name, node) if isinstance(node, Tag) else node

        if isinstance(rendered, urwid.Widget):
            nested_widgets.append(rendered)
        elif nested_widgets:
            # a nested widget was already seen
            trailing_markups.append(rendered)
        else:
            leading_markups.append(rendered)

    result = []
    if leading_markups:
        result.append(text_to_widget(tag.name, leading_markups))
    result.extend(nested_widgets)
    if trailing_markups:
        result.append(text_to_widget(tag.name, trailing_markups))
    return result
def get_urwid_attr_name(tag) -> str:
    """Pick the urwid text attribute name to use for tag.

    The tag's classes are joined as "class_<a>_<b>..." and that name is
    returned when it is one of STYLE_NAMES; otherwise the tag name is
    returned.
    """
    class_names = tag.attrs["class"] if "class" in tag.attrs else []
    if len(class_names) > 0:
        candidate = "class_" + "_".join(class_names)
        # only use the class-derived name when it is a known style name
        if candidate in STYLE_NAMES:
            return candidate

    # fallback to returning the tag name
    return tag.name
def basic_block_tag_handler(tag) -> urwid.Widget:
    """Default block tag handler: stack the tag's child widgets in a Pile."""
    children = process_block_tag_children(tag)
    return urwid.Pile(children)
def get_best_anchor_attr(attrib_list) -> str:
    """Choose the display attribute name for an anchor.

    Returns "" for an empty/None attrib_list, the first hashtag/mention
    class attribute found among its entries, or "a" when none is found.
    """
    if not attrib_list:
        return ""
    # flatten() only expands tuples, so a list argument comes back as a
    # single item and flat_al[0] is then that list itself.
    flat_al = list(flatten(attrib_list))

    for a in flat_al[0]:
        # ref: https://docs.joinmastodon.org/spec/activitypub/
        # these are the class names (translated to attrib names)
        # that we can support for display
        try:
            # NOTE(review): entries are presumably (attribute, length) pairs
            # as produced by decompose_tagmarkup - confirm.
            if a[0] in ["class_hashtag", "class_mention_hashtag", "class_mention"]:
                return a[0]
        except KeyError:
            # NOTE(review): presumably covers the [tag] list passed by
            # render_anchor, where indexing the tag with 0 fails - confirm.
            continue

    return "a"
def render(attr: str, content: str):
    """Dispatch content to the handler registered for the tag name attr.

    attr is the tag name; content is handed to the handler unchanged
    (callers in this module pass the tag object itself). Tag names without
    a dedicated handler fall back to inline_tag_to_text.
    """
    handlers = {
        "a": render_anchor,
        "blockquote": render_blockquote,
        "br": render_br,
        "em": render_em,
        "ol": render_ol,
        "pre": render_pre,
        "span": render_span,
        "b": render_strong,
        "strong": render_strong,
        "ul": render_ul,
    }

    # Glitch-soc and Pleroma allow <H1>...<H6> in content
    # Mastodon (PR #23913) does not; header tags are converted to <P><STRONG></STRONG></P>
    for block_name in ("p", "div", "li", "h1", "h2", "h3", "h4", "h5", "h6"):
        handlers[block_name] = basic_block_tag_handler

    handler = handlers.get(attr, inline_tag_to_text)
    return handler(content)
def render_anchor(tag) -> Tuple:
    """Anchor tag handler.

    Returns (attribute name, title). When the anchor has a non-empty href,
    the title is followed by an ASCII ETX character and the URL-encoded
    href, which text_to_widget later turns into a Hyperlink widget.
    An anchor without children yields (tag.name, "").
    """
    markups = process_inline_tag_children(tag)
    if not markups:
        return (tag.name, "")

    # an anchor is not required to carry an href (e.g. <a name="...">),
    # so a missing attribute must not raise KeyError
    href = tag.attrs.get("href")
    title, attrib_list = decompose_tagmarkup(markups)
    if not attrib_list:
        attrib_list = [tag]
    if href:
        # urlencode the path and query portions of the URL
        href = urlencode_url(href)
        # use ASCII ETX (end of record) as a
        # delimiter between the title and the HREF
        title += f"\x03{href}"

    attr = get_best_anchor_attr(attrib_list)

    if attr == "a":
        # didn't find an attribute to use
        # in the child markup, so let's
        # try the anchor tag's own attributes
        attr = get_urwid_attr_name(tag)

    # hashtag anchors have a class of "mention hashtag"
    # or "hashtag"
    # we'll return style "class_mention_hashtag"
    # or "class_hashtag"
    # in that case; see corresponding palette entry
    # in constants.py controlling hashtag highlighting
    return (attr, title)
def render_blockquote(tag) -> urwid.Widget:
    """Blockquote handler: padded child widgets styled as "blockquote"."""
    quoted = urwid.Pile(process_block_tag_children(tag))
    padded = urwid.Padding(
        quoted,
        align="left",
        width=("relative", 100),
        min_width=None,
        left=1,
        right=1,
    )

    # every border piece of the LineBox is set to an empty string
    border_parts = (
        "tlcorner", "tline", "lline", "trcorner",
        "blcorner", "rline", "bline", "brcorner",
    )
    boxed = urwid.LineBox(padded, **{part: "" for part in border_parts})
    return urwid.Pile([urwid.AttrMap(boxed, "blockquote")])
def render_br(tag) -> Tuple:
    """Line break handler: always a newline tagged "br"; tag is not inspected."""
    line_break = "\n"
    return ("br", line_break)
def render_em(tag) -> Tuple:
    """Emphasis handler: EM is mapped to the "i" (italic) attribute.

    Mapping EM to I keeps the number of palette entries down. When any
    ancestor is a <b> or <strong> tag, the combined "bi" attribute is
    used instead.
    """
    children = process_inline_tag_children(tag)
    if not children:
        return ("i", "")

    # special case processing for bold and italic
    inside_bold = any(
        ancestor.name in ("b", "strong") for ancestor in tag.parents
    )
    return ("bi" if inside_bold else "i", children)
def render_ol(tag) -> urwid.Widget:
    """Ordered list tag handler.

    Renders each direct <li> child prefixed with its ordinal ("1. ",
    "2. ", ...) and returns them stacked in a Pile. Honors the ol
    start= and reversed attributes and the li value= attribute.
    """
    widgets = []
    items = tag.find_all("li", recursive=False)
    is_reversed = tag.has_attr("reversed")
    increment = -1 if is_reversed else 1

    # Without a usable start= attribute a reversed list counts down from
    # the number of items (HTML spec), a regular list counts up from 1.
    list_item_num = len(items) if is_reversed else 1

    # get ol start= attribute if present
    if tag.has_attr("start") and len(tag.attrs["start"]) > 0:
        try:
            list_item_num = int(tag.attrs["start"])
        except ValueError:
            # not a number: keep the default starting value
            pass

    for li in items:
        markup = render("li", li)

        # li value= attribute will change the item number
        # it also overrides any ol start= attribute
        if li.has_attr("value") and len(li.attrs["value"]) > 0:
            try:
                list_item_num = int(li.attrs["value"])
            except ValueError:
                # not a number: keep the running item number
                pass

        if not isinstance(markup, urwid.Widget):
            # 1. foo, 2. bar, etc.
            txt = text_to_widget("li", [str(list_item_num), ". ", markup])
            widgets.append(txt)
        else:
            # item number in a narrow column, item content takes the rest
            txt = text_to_widget("li", [str(list_item_num), ". "])
            columns = urwid.Columns(
                [txt, ("weight", 9999, markup)], dividechars=1, min_width=3
            )
            widgets.append(columns)

        list_item_num += increment

    return urwid.Pile(widgets)
def render_pre(tag) -> urwid.Widget:
    """Preformatted text handler: padded child widgets styled as "pre".

    The <PRE> spec says text should not wrap, but horizontal screen space
    is at a premium and there is no horizontal scroll bar, so wrapping
    is allowed.
    """
    # a blank line precedes the content
    body = [urwid.Divider(" "), *process_block_tag_children(tag)]
    padded = urwid.Padding(
        urwid.Pile(body),
        align="left",
        width=("relative", 100),
        min_width=None,
        left=1,
        right=1,
    )
    return urwid.Pile([urwid.AttrMap(padded, "pre")])
def render_span(tag) -> Tuple:
    """Span handler.

    A span uses its own class-derived attribute name when one is defined
    for it; otherwise it inherits the attribute name of its parent tag.
    """
    children = process_inline_tag_children(tag)
    if not children:
        return (tag.name, "")

    if "class" in tag.attrs:
        # NOTE: spans with the "invisible" class (generally the http://
        # prefix of URLs) could be hidden here by returning (tag.name, "").
        # That could be a user preference; it is only advisable if the
        # terminal supports OSC 8 hyperlinks, which is not automatically
        # detectable.
        own_style = get_urwid_attr_name(tag)
        if own_style != "span":
            # unique class name matches an entry in our palette
            return (own_style, children)

    if not tag.parent:
        # fallback
        return ("span", children)
    return (get_urwid_attr_name(tag.parent), children)
def render_strong(tag) -> Tuple:
    """Bold handler: STRONG is mapped to the "b" (bold) attribute.

    Mapping STRONG to B keeps the number of palette entries down. When
    any ancestor is an <i> or <em> tag, the combined "bi" attribute is
    used instead.
    """
    children = process_inline_tag_children(tag)
    if not children:
        return ("b", "")

    # special case processing for bold and italic
    inside_italic = any(
        ancestor.name in ("i", "em") for ancestor in tag.parents
    )
    return ("bi" if inside_italic else "b", children)
def render_ul(tag) -> urwid.Widget:
    """Unordered list tag handler: one bulleted row per direct <li> child."""
    bullet = "\N{bullet} "
    rows = []

    for item in tag.find_all("li", recursive=False):
        rendered = render("li", item)

        if isinstance(rendered, urwid.Widget):
            # bullet in a narrow column, item content takes the rest
            marker = text_to_widget("li", [bullet])
            row = urwid.Columns(
                [marker, ("weight", 9999, rendered)], dividechars=1, min_width=3
            )
        else:
            # * foo, * bar, etc.
            row = text_to_widget("li", [bullet, rendered])
        rows.append(row)

    return urwid.Pile(rows)
def flatten(data):
    """Yield the non-tuple leaves of arbitrarily nested tuples, left to right.

    Only tuples are expanded; any other value (including a list) is
    yielded as a single item.
    """
    pending = [data]
    while pending:
        item = pending.pop()
        if isinstance(item, tuple):
            # push in reverse so the leftmost element is popped first
            pending.extend(reversed(item))
        else:
            yield item

View file

@ -5,14 +5,14 @@ import webbrowser
from typing import List, Optional
from toot.tui import app
from toot.utils import format_content
from toot.tui.richtext import html_to_widgets, url_to_widget
from toot.utils.datetime import parse_datetime, time_ago
from toot.utils.language import language_name
from .entities import Status
from .scroll import Scrollable, ScrollBar
from .utils import highlight_hashtags, highlight_keys
from .widgets import SelectableText, SelectableColumns
from toot.entities import Status
from toot.tui.scroll import Scrollable, ScrollBar
from toot.tui.utils import highlight_keys
from toot.tui.widgets import SelectableText, SelectableColumns
logger = logging.getLogger("toot")
@ -310,7 +310,6 @@ class Timeline(urwid.Columns):
class StatusDetails(urwid.Pile):
def __init__(self, timeline: Timeline, status: Optional[Status]):
self.status = status
self.followed_tags = timeline.tui.followed_tags
self.followed_accounts = timeline.tui.followed_accounts
reblogged_by = status.author if status and status.reblog else None
@ -340,8 +339,10 @@ class StatusDetails(urwid.Pile):
yield ("pack", urwid.Text(("content_warning", "Marked as sensitive. Press S to view.")))
else:
content = status.original.translation if status.original.show_translation else status.data["content"]
for line in format_content(content):
yield ("pack", urwid.Text(highlight_hashtags(line, self.followed_tags)))
widgetlist = html_to_widgets(content)
for line in widgetlist:
yield (line)
media = status.data["media_attachments"]
if media:
@ -350,7 +351,7 @@ class StatusDetails(urwid.Pile):
yield ("pack", urwid.Text([("bold", "Media attachment"), " (", m["type"], ")"]))
if m["description"]:
yield ("pack", urwid.Text(m["description"]))
yield ("pack", urwid.Text(("link", m["url"])))
yield ("pack", url_to_widget(m["url"]))
poll = status.original.data.get("poll")
if poll:
@ -410,7 +411,7 @@ class StatusDetails(urwid.Pile):
if card["description"]:
yield urwid.Text(card["description"].strip())
yield urwid.Text("")
yield urwid.Text(("link", card["url"]))
yield url_to_widget(card["url"])
def poll_generator(self, poll):
for idx, option in enumerate(poll["options"]):

View file

@ -35,15 +35,12 @@ def highlight_keys(text, high_attr, low_attr=""):
return list(_gen())
def highlight_hashtags(line, followed_tags, attr="hashtag", followed_attr="hashtag_followed"):
def highlight_hashtags(line):
hline = []
for p in re.split(HASHTAG_PATTERN, line):
if p.startswith("#"):
if p[1:].lower() in (t.lower() for t in followed_tags):
hline.append((followed_attr, p))
else:
hline.append((attr, p))
hline.append(("hashtag", p))
else:
hline.append(p)

View file

@ -10,6 +10,7 @@ from bs4 import BeautifulSoup
from typing import Dict
from toot.exceptions import ConsoleError
from urllib.parse import urlparse, urlencode, quote, unquote
def str_bool(b):
@ -22,20 +23,22 @@ def str_bool_nullable(b):
return None if b is None else str_bool(b)
def get_text(html):
"""Converts html to text, strips all tags."""
def parse_html(html: str) -> BeautifulSoup:
# Ignore warnings made by BeautifulSoup, if passed something that looks like
# a file (e.g. a dot which matches current dict), it will warn that the file
# should be opened instead of passing a filename.
with warnings.catch_warnings():
warnings.simplefilter("ignore")
text = BeautifulSoup(html.replace('&apos;', "'"), "html.parser").get_text()
return unicodedata.normalize('NFKC', text)
return BeautifulSoup(html.replace("&apos;", "'"), "html.parser")
def parse_html(html):
def get_text(html):
"""Converts html to text, strips all tags."""
text = parse_html(html).get_text()
return unicodedata.normalize("NFKC", text)
def html_to_paragraphs(html):
"""Attempt to convert html to plain text while keeping line breaks.
Returns a list of paragraphs, each being a list of lines.
"""
@ -54,7 +57,7 @@ def format_content(content):
Returns a generator yielding lines of content.
"""
paragraphs = parse_html(content)
paragraphs = html_to_paragraphs(content)
first = True
@ -186,3 +189,14 @@ def _warn_scheme_deprecated():
"instead write:",
" toot instance http://unsafehost.com\n"
]))
def urlencode_url(url):
    """Return url with its path and query string percent-encoded.

    Existing percent-escapes are decoded first, so an already encoded URL
    is not encoded twice. Scheme, host, ;params and fragment are left
    untouched. Query values are form-decoded while parsing, so a "+" in
    the query comes back as "%20".
    """
    # local import: the module-level import only provides
    # urlparse, urlencode, quote and unquote
    from urllib.parse import parse_qsl

    parsed_url = urlparse(url)

    # unencode before encoding, to prevent double-urlencoding
    encoded_path = quote(unquote(parsed_url.path), safe="-._~()'!*:@,;+&=/")

    # parse_qsl already decodes the keys and values; quote_via=quote
    # encodes spaces as %20 and each pair is encoded exactly once
    query_pairs = parse_qsl(parsed_url.query, keep_blank_values=True)
    encoded_query = urlencode(query_pairs, safe="-._~()'!*:@,;?/", quote_via=quote)

    return parsed_url._replace(path=encoded_path, query=encoded_query).geturl()