Merge pull request 'Clean up warnings (+fallback metrics)' (#685) from Oneric/akkoma:metrics into develop

Reviewed-on: AkkomaGang/akkoma#685
2024-02-17 11:41:10 +00:00 · 2024-02-17 11:41:10 +00:00 · 755c75d8a4
commit 755c75d8a4
parent 289f93f5a2 29f564f700
3 changed files with 109 additions and 9 deletions
--- a/lib/pleroma/job_queue_monitor.ex
+++ b/lib/pleroma/job_queue_monitor.ex
@ -15,8 +15,19 @@ def start_link(_) do

  @impl true
  def init(state) do
-    :telemetry.attach("oban-monitor-failure", [:oban, :job, :exception], &handle_event/4, nil)
-    :telemetry.attach("oban-monitor-success", [:oban, :job, :stop], &handle_event/4, nil)
+    :telemetry.attach(
+      "oban-monitor-failure",
+      [:oban, :job, :exception],
+      &Pleroma.JobQueueMonitor.handle_event/4,
+      nil
+    )
+
+    :telemetry.attach(
+      "oban-monitor-success",
+      [:oban, :job, :stop],
+      &Pleroma.JobQueueMonitor.handle_event/4,
+      nil
+    )

    {:ok, state}
  end
--- a/lib/pleroma/web/telemetry.ex
+++ b/lib/pleroma/web/telemetry.ex
@ -101,7 +101,8 @@ defp distribution_metrics do
    ]
  end

-  defp summary_metrics do
+  # Summary metrics are currently not (yet) supported by the prometheus exporter
+  defp summary_metrics(byte_unit) do
    [
      # Phoenix Metrics
      summary("phoenix.endpoint.stop.duration",
@ -118,10 +119,98 @@ defp summary_metrics do
      summary("pleroma.repo.query.idle_time", unit: {:native, :millisecond}),

      # VM Metrics
-      summary("vm.memory.total", unit: {:byte, :kilobyte}),
+      summary("vm.memory.total", unit: {:byte, byte_unit}),
      summary("vm.total_run_queue_lengths.total"),
      summary("vm.total_run_queue_lengths.cpu"),
-      summary("vm.total_run_queue_lengths.io"),
+      summary("vm.total_run_queue_lengths.io")
+    ]
+  end
+
+  defp sum_counter_pair(basename, opts) do
+    [
+      sum(basename <> ".psum", opts),
+      counter(basename <> ".pcount", opts)
+    ]
+  end
+
+  # Prometheus exporter doesn't support summaries, so provide fallbacks
+  defp summary_fallback_metrics(byte_unit \\ :byte) do
+    # Summary metrics are not supported by the Prometheus exporter
+    #   https://github.com/beam-telemetry/telemetry_metrics_prometheus_core/issues/11
+    # and sum metrics currently only work with integers
+    #   https://github.com/beam-telemetry/telemetry_metrics_prometheus_core/issues/35
+    #
+    # For VM metrics this is kindof ok as they appear to always be integers
+    # and we can use sum + counter to get the average between polls from their change
+    # But for repo query times we need to use a full distribution
+
+    simple_buckets = [0, 1, 2, 4, 8, 16]
+    simple_buckets_quick = for t <- simple_buckets, do: t / 100.0
+
+    # Already included in distribution metrics anyway:
+    #   phoenix.router_dispatch.stop.duration
+    #   pleroma.repo.query.total_time
+    #   pleroma.repo.query.queue_time
+    dist_metrics =
+      [
+        distribution("phoenix.endpoint.stop.duration.fdist",
+          event_name: [:phoenix, :endpoint, :stop],
+          measurement: :duration,
+          unit: {:native, :millisecond},
+          reporter_options: [
+            buckets: simple_buckets
+          ]
+        ),
+        distribution("pleroma.repo.query.decode_time.fdist",
+          event_name: [:pleroma, :repo, :query],
+          measurement: :decode_time,
+          unit: {:native, :millisecond},
+          reporter_options: [
+            buckets: simple_buckets_quick
+          ]
+        ),
+        distribution("pleroma.repo.query.query_time.fdist",
+          event_name: [:pleroma, :repo, :query],
+          measurement: :query_time,
+          unit: {:native, :millisecond},
+          reporter_options: [
+            buckets: simple_buckets
+          ]
+        ),
+        distribution("pleroma.repo.query.idle_time.fdist",
+          event_name: [:pleroma, :repo, :query],
+          measurement: :idle_time,
+          unit: {:native, :millisecond},
+          reporter_options: [
+            buckets: simple_buckets
+          ]
+        )
+      ]
+
+    vm_metrics =
+      sum_counter_pair("vm.memory.total",
+        event_name: [:vm, :memory],
+        measurement: :total,
+        unit: {:byte, byte_unit}
+      ) ++
+        sum_counter_pair("vm.total_run_queue_lengths.total",
+          event_name: [:vm, :total_run_queue_lengths],
+          measurement: :total
+        ) ++
+        sum_counter_pair("vm.total_run_queue_lengths.cpu",
+          event_name: [:vm, :total_run_queue_lengths],
+          measurement: :cpu
+        ) ++
+        sum_counter_pair("vm.total_run_queue_lengths.io.fsum",
+          event_name: [:vm, :total_run_queue_lengths],
+          measurement: :io
+        )
+
+    dist_metrics ++ vm_metrics
+  end
+
+  defp common_metrics do
+    [
      last_value("pleroma.local_users.total"),
      last_value("pleroma.domains.total"),
      last_value("pleroma.local_statuses.total"),
@ -129,8 +218,10 @@ defp summary_metrics do
    ]
  end

-  def prometheus_metrics, do: summary_metrics() ++ distribution_metrics()
-  def live_dashboard_metrics, do: summary_metrics()
+  def prometheus_metrics,
+    do: common_metrics() ++ distribution_metrics() ++ summary_fallback_metrics()
+
+  def live_dashboard_metrics, do: common_metrics() ++ summary_metrics(:megabyte)

  defp periodic_measurements do
    [
--- a/priv/scrubbers/default.ex
+++ b/priv/scrubbers/default.ex
@ -128,6 +128,4 @@ defmodule Pleroma.HTML.Scrubber.Default do
  Meta.allow_tag_with_these_attributes(:small, [])

  Meta.strip_everything_not_covered()
-
-  defp scrub_css(value), do: value
 end