better formatting for titles

parse_source option to extract title
try zero-length titles, add source:title extension
2022-12-20 11:00:47 -08:00 · 2022-12-20 09:01:05 -08:00 · 2022-12-19 17:50:46 -08:00 · 2022-12-05 09:56:30 -08:00
11 changed files with 251 additions and 27 deletions
--- a/config/config.exs
+++ b/config/config.exs
@ -282,8 +282,11 @@

 config :pleroma, :feed,
  post_title: %{
+    # Set max_length to 0 to suppress titles (Dave Winer suggestion)
    max_length: 100,
-    omission: "..."
+    omission: "...",
+    # New method to extract title
+    parse_source: false
  }

 config :pleroma, :markup,
--- a/config/description.exs
+++ b/config/description.exs
@ -2919,6 +2919,7 @@
  %{
    group: :pleroma,
    key: :feed,
+    label: "RSS Feeds",
    type: :group,
    description: "Configure feed rendering",
    children: [
@ -2938,7 +2939,13 @@
            type: :string,
            description: "Replacement which will be used after truncating string",
            suggestions: ["..."]
-          }
+          },
+          %{
+            key: :parse_source,
+            type: :boolean,
+            description: "Use content type-specific parsers to extract title (ignores max_length)",
+            suggestions: [true]
+          },
        ]
      }
    ]
--- a/lib/pleroma/formatter.ex
+++ b/lib/pleroma/formatter.ex
@ -150,15 +150,19 @@ def html_escape(text, "text/plain") do
    |> Enum.join("")
  end

-  def truncate(text, max_length \\ 200, omission \\ "...") do
+  def truncate(text, max_length \\ 200, omission \\ "...") when max_length >= 0 do
    # Remove trailing whitespace
    text = Regex.replace(~r/([^ \t\r\n])([ \t]+$)/u, text, "\\g{1}")

-    if String.length(text) < max_length do
-      text
-    else
-      length_with_omission = max_length - String.length(omission)
-      String.slice(text, 0, length_with_omission) <> omission
+    length_with_omission = max_length - String.length(omission)
+
+    cond do
+      String.length(text) <= max_length ->
+        text
+      length_with_omission > 0 ->
+        String.slice(text, 0, length_with_omission) <> omission
+      true ->
+        String.slice(text, 0, max_length)
    end
  end

--- a/lib/pleroma/web/feed/feed_view.ex
+++ b/lib/pleroma/web/feed/feed_view.ex
@ -12,6 +12,7 @@ defmodule Pleroma.Web.Feed.FeedView do
  alias Pleroma.Web.Gettext
  alias Pleroma.Web.MediaProxy

+  require Logger
  require Pleroma.Constants

  @spec pub_date(String.t() | DateTime.t()) :: String.t()
@ -69,7 +70,24 @@ def logo(user) do

  def last_activity(activities), do: List.last(activities)

-  def activity_title(%{"content" => content}, opts \\ %{}) do
+  @doc """
+  Renders the activity title as a complete `<title>` element.
+
+  The title is computed by `activity_title/2`. When that yields an empty
+  string, an empty string is returned so no `<title>` element is emitted.
+  """
+  def maybe_activity_title(activity, opts \\ %{}) do
+    title = activity_title(activity, opts)
+
+    if title == "" do
+      ""
+    else
+      "<title>#{title}</title>"
+    end
+  end
+
+  def activity_title(activity, opts \\ %{})
+
+  # Source-parsing clause: the title is taken from the raw post source by
+  # split_content/3. It is interpolated into the feed XML by the caller, so it
+  # is escaped here, just like the title built by the "content" clause.
+  def activity_title(
+        %{"source" => %{"mediaType" => content_type, "content" => content}},
+        %{parse_source: true} = opts
+      ) do
+    content
+    |> split_content(content_type, opts)
+    |> elem(0)
+    |> escape()
+  end
+
+  # TODO: scrub_html should replace <p> with " "
+  def activity_title(%{"content" => content}, opts) do
    content
    |> Pleroma.Web.Metadata.Utils.scrub_html()
    |> Pleroma.Emoji.Formatter.demojify()
@ -77,13 +95,36 @@ def activity_title(%{"content" => content}, opts \\ %{}) do
    |> escape()
  end

-  def activity_content(%{"content" => content}) do
-    content
-    |> String.replace(~r/[\n\r]/, "")
+  def activity_title(_, _), do: ""
+
+  def activity_content(activity, opts \\ %{})
+
+  def activity_content(
+        %{"source" => %{"mediaType" => content_type, "content" => content}},
+        %{parse_source: true} = opts
+      ) do
+    start = split_content(content, content_type, opts) |> elem(1)
+    length = String.length(content)
+
+    {text, _mentions, _tags} =
+      String.slice(content, start, length)
+      |> Pleroma.Web.CommonAPI.Utils.format_input(content_type)
+
+    text
+    |> String.replace(~r/(\r?\n)+/, " ")
+    |> String.trim()
    |> escape()
  end

-  def activity_content(_), do: ""
+  def activity_content(%{"content" => content}, _opts) do
+    # Replace 1 or more newlines with 1 space
+    content
+    |> String.replace(~r/(\r?\n)+/, " ")
+    |> String.trim()
+    |> escape()
+  end
+
+  def activity_content(_, _), do: ""

  def activity_context(activity), do: escape(activity.data["context"])

@ -99,6 +140,117 @@ def attachment_type(attachment) do
    |> Map.get("mediaType")
  end

+  @doc """
+  Returns the XML-escaped raw source of an activity.
+
+  Expects a map with a `"source"` entry holding both `"mediaType"` and
+  `"content"`. Any other input yields an empty string instead of raising,
+  so a source map without `"content"` cannot break feed rendering.
+  """
+  def source_content(%{"source" => %{"mediaType" => _, "content" => content}}) do
+    xml_escape(content)
+  end
+
+  def source_content(_activity), do: ""
+
+  @doc """
+  Extracts the title from the raw source of an activity.
+
+  Delegates to `split_content/3` and returns the first element of its result.
+  Activities without a `"source"` map carrying `"mediaType"` and `"content"`
+  have no title, which is reported as an empty string.
+  """
+  def parse_title(activity, opts \\ %{})
+
+  def parse_title(%{"source" => %{"mediaType" => content_type, "content" => content}}, opts) do
+    split_content(content, content_type, opts) |> elem(0)
+  end
+
+  # A missing title is text, not a number: return "" rather than 0 so the
+  # value can be rendered as-is.
+  def parse_title(_activity, _opts), do: ""
+
+  @doc """
+  Returns the offset at which the description starts in the raw source.
+
+  This is the second element of the tuple produced by `split_content/3`.
+  Activities without a `"source"` map carrying `"mediaType"` and `"content"`
+  yield `0`.
+  """
+  def parse_description_offset(activity, opts \\ %{})
+
+  def parse_description_offset(
+        %{"source" => %{"mediaType" => media_type, "content" => raw}},
+        opts
+      ) do
+    {_title, offset} = split_content(raw, media_type, opts)
+    offset
+  end
+
+  def parse_description_offset(_activity, _opts), do: 0
+
+  @doc """
+  Splits raw post source into `{title, description_start}`.
+
+  `title` is the heading found at the top of `content` for the given media
+  type, and `description_start` is the grapheme offset at which the rest of
+  the post begins. `{"", 0}` means no title was found.
+
+  Recognised headings per media type:
+
+    * `"text/html"` - an `<h1>` or `<h2>` element at the start of a line
+    * `"text/plain"` - the first line
+    * `"text/bbcode"` - a `[b]...[/b]` run in the first line
+    * `"text/markdown"` - a first line starting with `#` or `##`
+    * `"text/x.misskeymarkdown"` - a first line starting with `**...**`
+
+  Any other media type yields `{"", 0}`.
+  """
+  @spec split_content(binary(), binary(), any()) :: {binary(), non_neg_integer()}
+  def split_content(content, "text/html", _opts) do
+    case Regex.named_captures(
+           ~r/^[ \t]*<(?<tag>h[12])([ \t][^>]+)?>(?<title_inner>[^<]+)<\/h[12](?<title_end>[ \t]*>)/im,
+           content,
+           return: :index
+         ) do
+      %{
+        "title_inner" => {title_start, title_length},
+        "title_end" => {title_end_start, title_end_length}
+      } ->
+        # `return: :index` reports byte offsets, so the title must be cut with
+        # binary_part/3; String.slice/3 counts graphemes and would drift on
+        # any multi-byte character.
+        title = content |> binary_part(title_start, title_length) |> String.trim()
+
+        # Convert the byte offset of the end of the closing tag to a grapheme
+        # offset, matching what the text-based clauses return.
+        description_start =
+          content
+          |> binary_part(0, title_end_start + title_end_length)
+          |> String.length()
+
+        {title, description_start}
+
+      _ ->
+        # A post without a leading heading is an ordinary case, not an error.
+        Logger.debug("No H1/H2 match")
+        {"", 0}
+    end
+  end
+
+  def split_content(content, "text/plain", opts) do
+    case split_text_lines(content, opts) do
+      {_str, 0, 0} ->
+        {"", 0}
+
+      {str, title_end, description_start} ->
+        {title, _rest} = String.split_at(str, title_end)
+        {String.trim(title), description_start}
+    end
+  end
+
+  def split_content(content, "text/bbcode", opts) do
+    content
+    |> split_text_lines(opts)
+    |> match_title(~r/\[b\](?<title>[^\[]+)\[\/b\]/)
+  end
+
+  def split_content(content, "text/markdown", opts) do
+    content
+    |> split_text_lines(opts)
+    |> match_title(~r/^[#]{1,2}[ \t](?<title>.+)/)
+  end
+
+  def split_content(content, "text/x.misskeymarkdown", opts) do
+    content
+    |> split_text_lines(opts)
+    |> match_title(~r/^\*\*(?<title>.+)\*\*/)
+  end
+
+  # Unknown media type: no title. The title slot must be a binary per the spec.
+  def split_content(_, _, _), do: {"", 0}
+
+  @doc """
+  Locates the title line and the start of the description in plain text.
+
+  Returns `{str, title_end, description_start}`:
+
+    * `title_end` - grapheme length of the first line without trailing
+      whitespace
+    * `description_start` - grapheme offset of the first character after the
+      newline that ends the first line
+
+  `{str, 0, 0}` means no title line was found. That is the result when `str`
+  has fewer than three lines or its first line is blank.
+  """
+  def split_text_lines(str, _opts) do
+    case String.split(str, "\n", parts: 3) do
+      [first, second, rest] ->
+        title_end = first |> String.trim_trailing() |> String.length()
+
+        # Measure what follows the title line and subtract it from the total,
+        # so the newline separator (including "\r\n", which is a single
+        # grapheme) is accounted for.
+        after_title = second <> "\n" <> rest
+        description_start = String.length(str) - String.length(after_title)
+
+        if title_end == 0 do
+          {str, 0, 0}
+        else
+          {str, title_end, description_start}
+        end
+
+      # One or two lines: String.split/3 never returns [], so this clause
+      # also covers content without any newline.
+      _fewer_than_three_lines ->
+        {str, 0, 0}
+    end
+  end
+
+  @doc """
+  Applies `regex` to the title line located by `split_text_lines/2`.
+
+  Takes the `{str, title_end, description_start}` tuple, trims the first
+  `title_end` graphemes of `str` and matches them against `regex`, which must
+  define a named capture `title`. Returns `{title, description_start}` on a
+  match and `{"", 0}` when there is no title line or no match.
+  """
+  def match_title({_str, 0, 0}, _regex), do: {"", 0}
+
+  def match_title({str, te, ds}, regex) do
+    {head, _tail} = String.split_at(str, te)
+
+    case Regex.named_captures(regex, String.trim(head)) do
+      %{"title" => raw_title} -> {String.trim(raw_title), ds}
+      _no_match -> {"", 0}
+    end
+  end
+
  def get_href(id) do
    with %Object{data: %{"external_url" => external_url}} <- Object.get_cached_by_ap_id(id) do
      external_url
@ -112,4 +264,37 @@ def escape(html) do
    |> html_escape()
    |> safe_to_string()
  end
+
+  @doc """
+  Wraps `str` in an XML CDATA section.
+
+  A literal `]]>` inside `str` would end the section early, so each occurrence
+  is split across two adjacent CDATA sections.
+  """
+  def cdata(str) do
+    "<![CDATA[" <> String.replace(str, "]]>", "]]]]><![CDATA[>") <> "]]>"
+  end
+
+  @doc """
+  Escapes `str` for inclusion in XML.
+
+  `<`, `>`, tab, newline, carriage return and both quote characters are
+  replaced with entity or numeric character references. An `&` that already
+  begins one of the five predefined entities (`&amp;`, `&lt;`, `&gt;`,
+  `&quot;`, `&apos;`) is kept as is; any other `&` becomes `&amp;`.
+
+  Returns `""` for `nil`.
+  """
+  def xml_escape(nil), do: ""
+
+  def xml_escape(str) when is_binary(str) do
+    str
+    |> xml_escape_string()
+    |> to_string()
+  end
+
+  # Walks the string one code point at a time, building chardata.
+  defp xml_escape_string(""), do: ""
+  defp xml_escape_string(<<"&"::utf8, rest::binary>>), do: xml_escape_entity(rest)
+  defp xml_escape_string(<<"<"::utf8, rest::binary>>), do: ["&lt;" | xml_escape_string(rest)]
+  defp xml_escape_string(<<">"::utf8, rest::binary>>), do: ["&gt;" | xml_escape_string(rest)]
+  defp xml_escape_string(<<"\t"::utf8, rest::binary>>), do: ["&#9;" | xml_escape_string(rest)]
+  defp xml_escape_string(<<"\n"::utf8, rest::binary>>), do: ["&#10;" | xml_escape_string(rest)]
+  defp xml_escape_string(<<"\r"::utf8, rest::binary>>), do: ["&#13;" | xml_escape_string(rest)]
+  # The double quote is code point 34 (hex 22); "&#22;" would reference U+0016,
+  # which is not a legal XML 1.0 character.
+  defp xml_escape_string(<<"\""::utf8, rest::binary>>), do: ["&#34;" | xml_escape_string(rest)]
+  defp xml_escape_string(<<"\'"::utf8, rest::binary>>), do: ["&#39;" | xml_escape_string(rest)]
+  defp xml_escape_string(<<c::utf8, rest::binary>>), do: [c | xml_escape_string(rest)]
+
+  # Called with the text that follows an "&": keeps predefined entities intact
+  # and escapes a bare ampersand.
+  defp xml_escape_entity(<<"amp;"::utf8, rest::binary>>), do: ["&amp;" | xml_escape_string(rest)]
+  defp xml_escape_entity(<<"lt;"::utf8, rest::binary>>), do: ["&lt;" | xml_escape_string(rest)]
+  defp xml_escape_entity(<<"gt;"::utf8, rest::binary>>), do: ["&gt;" | xml_escape_string(rest)]
+
+  defp xml_escape_entity(<<"quot;"::utf8, rest::binary>>),
+    do: ["&quot;" | xml_escape_string(rest)]
+
+  defp xml_escape_entity(<<"apos;"::utf8, rest::binary>>),
+    do: ["&apos;" | xml_escape_string(rest)]
+
+  defp xml_escape_entity(rest), do: ["&amp;" | xml_escape_string(rest)]
 end
--- a/lib/pleroma/web/metadata/utils.ex
+++ b/lib/pleroma/web/metadata/utils.ex
@ -42,8 +42,12 @@ def scrub_html(content) when is_binary(content) do
    content
    # html content comes from DB already encoded, decode first and scrub after
    |> HtmlEntities.decode()
-    |> String.replace(~r/<br\s?\/?>/, " ")
+    |> String.replace(~r/<(br|p)[^>]*>/, "\\0&nbsp;")
+    |> String.replace(~r/<\/p\s*\/?>/, "&nbsp;\\0")
    |> HTML.strip_tags()
+    # strip_tags will convert &nbsp; to U+00A0, adding /u will match these to " "
+    |> String.replace(~r/\s+/u, " ")
+    |> String.trim()
  end

  def scrub_html(content), do: content
--- a/lib/pleroma/web/templates/feed/feed/_activity.atom.eex
+++ b/lib/pleroma/web/templates/feed/feed/_activity.atom.eex
@ -2,8 +2,10 @@
  <activity:object-type>http://activitystrea.ms/schema/1.0/note</activity:object-type>
  <activity:verb>http://activitystrea.ms/schema/1.0/post</activity:verb>
  <id><%= @data["id"] %></id>
-  <title><%= activity_title(@data, Keyword.get(@feed_config, :post_title, %{})) %></title>
-  <content type="html"><%= activity_content(@data) %></content>
+
+  <%= maybe_activity_title(@data, Keyword.get(@feed_config, :post_title, %{})) %>
+
+  <content type="html"><%= activity_content(@data, Keyword.get(@feed_config, :post_title, %{})) %></content>
  <published><%= @activity.data["published"] %></published>
  <updated><%= @activity.data["published"] %></updated>
  <ostatus:conversation ref="<%= activity_context(@activity) %>">
@ -15,6 +17,13 @@
    <summary><%= escape(@data["summary"]) %></summary>
  <% end %>

+  <%= if !is_nil(get_in(@data, ["source", "mediaType"])) do %>
+    <source:contentType><%= get_in(@data, ["source", "mediaType"]) %></source:contentType>
+    <source:content><%= source_content(@data) %></source:content>
+    <source:title><%= parse_title(@data, Keyword.get(@feed_config, :post_title, %{})) %></source:title>
+    <source:descriptionOffset><%= parse_description_offset(@data, Keyword.get(@feed_config, :post_title, %{})) %></source:descriptionOffset>
+  <% end %>
+
  <%= if @activity.local do %>
    <link type="application/atom+xml" href='<%= @data["id"] %>' rel="self"/>
    <link type="text/html" href='<%= @data["id"] %>' rel="alternate"/>
--- a/lib/pleroma/web/templates/feed/feed/_activity.rss.eex
+++ b/lib/pleroma/web/templates/feed/feed/_activity.rss.eex
@ -2,8 +2,10 @@
  <activity:object-type>http://activitystrea.ms/schema/1.0/note</activity:object-type>
  <activity:verb>http://activitystrea.ms/schema/1.0/post</activity:verb>
  <guid><%= @data["id"] %></guid>
-  <title><%= activity_title(@data, Keyword.get(@feed_config, :post_title, %{})) %></title>
-  <description><%= activity_content(@data) %></description>
+
+  <%= maybe_activity_title(@data, Keyword.get(@feed_config, :post_title, %{})) %>
+
+  <description><%= activity_content(@data, Keyword.get(@feed_config, :post_title, %{})) %></description>
  <pubDate><%= @activity.data["published"] %></pubDate>
  <updated><%= @activity.data["published"] %></updated>
  <ostatus:conversation ref="<%= activity_context(@activity) %>">
@ -11,7 +13,14 @@
  </ostatus:conversation>

  <%= if @data["summary"] do %>
-    <description><%= escape(@data["summary"]) %></description>
+    <masto:summary><%= escape(@data["summary"]) %></masto:summary>
+  <% end %>
+
+  <%= if !is_nil(get_in(@data, ["source", "mediaType"])) do %>
+    <source:contentType><%= get_in(@data, ["source", "mediaType"]) %></source:contentType>
+    <source:content><%= source_content(@data) %></source:content>
+    <source:title><%= parse_title(@data, Keyword.get(@feed_config, :post_title, %{})) %></source:title>
+    <source:descriptionOffset><%= parse_description_offset(@data, Keyword.get(@feed_config, :post_title, %{})) %></source:descriptionOffset>
  <% end %>

  <%= if @activity.local do %>
--- a/lib/pleroma/web/templates/feed/feed/_tag_activity.atom.eex
+++ b/lib/pleroma/web/templates/feed/feed/_tag_activity.atom.eex
@ -5,8 +5,10 @@
    <%= render @view_module, "_tag_author.atom", assigns %>

    <id><%= @data["id"] %></id>
-    <title><%= activity_title(@data, Keyword.get(@feed_config, :post_title, %{})) %></title>
-    <content type="html"><%= activity_content(@data) %></content>
+
+    <%= maybe_activity_title(@data, Keyword.get(@feed_config, :post_title, %{})) %>
+
+    <content type="html"><%= activity_content(@data, Keyword.get(@feed_config, :post_title, %{})) %></content>

  <%= if @activity.local do %>
    <link type="application/atom+xml" href='<%= @data["id"] %>' rel="self"/>
--- a/lib/pleroma/web/templates/feed/feed/_tag_activity.xml.eex
+++ b/lib/pleroma/web/templates/feed/feed/_tag_activity.xml.eex
@ -1,12 +1,11 @@
 <item>
-  <title><%= activity_title(@data, Keyword.get(@feed_config, :post_title, %{})) %></title>
-
-
  <guid isPermalink="true"><%= activity_context(@activity) %></guid>
  <link><%= activity_context(@activity) %></link>
  <pubDate><%= pub_date(@activity.data["published"]) %></pubDate>

-  <description><%= activity_content(@data) %></description>
+  <%= maybe_activity_title(@data, Keyword.get(@feed_config, :post_title, %{})) %>
+
+  <description><%= activity_content(@data, Keyword.get(@feed_config, :post_title, %{})) %></description>
  <%= for attachment <- @data["attachment"] || [] do %>
    <enclosure url="<%= attachment_href(attachment) %>" type="<%= attachment_type(attachment) %>"/>
  <% end %>
--- a/lib/pleroma/web/templates/feed/feed/tag.atom.eex
+++ b/lib/pleroma/web/templates/feed/feed/tag.atom.eex
@ -7,7 +7,8 @@
      xmlns:media="http://purl.org/syndication/atommedia"
      xmlns:poco="http://portablecontacts.net/spec/1.0"
      xmlns:ostatus="http://ostatus.org/schema/1.0"
-      xmlns:statusnet="http://status.net/schema/api/1/">
+      xmlns:statusnet="http://status.net/schema/api/1/"
+      xmlns:source="http://source.scripting.com/">

    <id><%= '#{Routes.tag_feed_url(@conn, :feed, @tag)}.rss' %></id>
    <title>#<%= @tag %></title>
--- a/lib/pleroma/web/templates/feed/feed/user.atom.eex
+++ b/lib/pleroma/web/templates/feed/feed/user.atom.eex
@ -4,7 +4,8 @@
  xmlns:thr="http://purl.org/syndication/thread/1.0"
  xmlns:activity="http://activitystrea.ms/spec/1.0/"
  xmlns:poco="http://portablecontacts.net/spec/1.0"
-  xmlns:ostatus="http://ostatus.org/schema/1.0">
+  xmlns:ostatus="http://ostatus.org/schema/1.0"
+  xmlns:source="http://source.scripting.com/">

  <id><%= Routes.user_feed_url(@conn, :feed, @user.nickname) <> ".atom" %></id>
  <title><%= @user.nickname <> "'s timeline" %></title>
Author	SHA1	Message	Date
Peter Zingg	b0532e136b	better formatting for titles	2022-12-20 11:00:47 -08:00
Peter Zingg	8b488899aa	parse_source option to extract title	2022-12-20 09:01:05 -08:00
Peter Zingg	c483c7dac4	try zero-length titles, add source:title extension	2022-12-19 17:50:46 -08:00
Peter Zingg	eed5d7dc76	support source:markdown in RSS feed	2022-12-05 09:56:30 -08:00