Replace `&apos;` by "'" before parsing HTML
BeautifulSoup does not parse HTML entities like `&apos;` as we expect, and the previous logic of replacing this *after* HTML parsing occurred did not produce the expected results. To illustrate this, we change the data in "test_timeline" to include a literal `&apos;`, as sometimes occurs in data returned by the Mastodon API. The new HTML content is: <p>The computer can&apos;t tell you the emotional story [...] </p> BeautifulSoup will parse this as: <p>The computer can&apost tell you the emotional story [...] </p> which is not what we expect. We fix this by replacing `&apos;` *before* HTML parsing by BeautifulSoup. Since the test data in "test_timeline" got updated, we also add an extra assertion checking that the part of the content with a literal "'" is (still) properly rendered.
This commit is contained in:
parent
91fc273af7
commit
0f6bd920c3
3 changed files with 5 additions and 4 deletions
|
@ -126,7 +126,7 @@ def test_timeline(mock_get, monkeypatch, capsys):
|
|||
'username': 'fz'
|
||||
},
|
||||
'created_at': '2017-04-12T15:53:18.174Z',
|
||||
'content': "<p>The computer can't tell you the emotional story. It can give you the exact mathematical design, but what's missing is the eyebrows.</p>",
|
||||
'content': "<p>The computer can&apos;t tell you the emotional story. It can give you the exact mathematical design, but what's missing is the eyebrows.</p>",
|
||||
'reblog': None,
|
||||
}])
|
||||
|
||||
|
@ -136,6 +136,7 @@ def test_timeline(mock_get, monkeypatch, capsys):
|
|||
|
||||
out, err = capsys.readouterr()
|
||||
assert "The computer can't tell you the emotional story." in out
|
||||
assert "but what's missing is the eyebrows." in out
|
||||
assert "Frank Zappa" in out
|
||||
assert "@fz" in out
|
||||
|
||||
|
|
|
@ -148,8 +148,8 @@ def print_timeline(items):
|
|||
content = item['reblog']['content'] if item['reblog'] else item['content']
|
||||
reblogged = item['reblog']['account']['username'] if item['reblog'] else None
|
||||
|
||||
soup = BeautifulSoup(content, "html.parser")
|
||||
text = soup.get_text().replace('&apos;', "'")
|
||||
soup = BeautifulSoup(content.replace('&apos;', "'"), "html.parser")
|
||||
text = soup.get_text()
|
||||
time = datetime.strptime(item['created_at'], "%Y-%m-%dT%H:%M:%S.%fZ")
|
||||
|
||||
return {
|
||||
|
|
|
@ -11,7 +11,7 @@ from toot.exceptions import ConsoleError
|
|||
|
||||
def get_text(html):
|
||||
"""Converts html to text, strips all tags."""
|
||||
text = BeautifulSoup(html, "html.parser").get_text().replace('&apos;', "'")
|
||||
text = BeautifulSoup(html.replace('&apos;', "'"), "html.parser").get_text()
|
||||
|
||||
return unicodedata.normalize('NFKC', text)
|
||||
|
||||
|
|
Loading…
Reference in a new issue