From a6df71eebb6fcb29460b510315048cea4cb32f07 Mon Sep 17 00:00:00 2001 From: Oneric Date: Sat, 3 Feb 2024 17:30:00 +0100 Subject: [PATCH 1/5] Don't add summary metrics to prometheus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The exporter doesn’t support them thus we don't lose anything by this, but it avoids a bunch of warnings each time the server starts up. --- lib/pleroma/web/telemetry.ex | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/lib/pleroma/web/telemetry.ex b/lib/pleroma/web/telemetry.ex index b03850600..eecaffe88 100644 --- a/lib/pleroma/web/telemetry.ex +++ b/lib/pleroma/web/telemetry.ex @@ -101,6 +101,7 @@ defmodule Pleroma.Web.Telemetry do ] end + # Summary metrics are currently not (yet) supported by the prometheus exporter defp summary_metrics do [ # Phoenix Metrics @@ -121,7 +122,12 @@ defmodule Pleroma.Web.Telemetry do summary("vm.memory.total", unit: {:byte, :kilobyte}), summary("vm.total_run_queue_lengths.total"), summary("vm.total_run_queue_lengths.cpu"), - summary("vm.total_run_queue_lengths.io"), + summary("vm.total_run_queue_lengths.io") + ] + end + + defp common_metrics do + [ last_value("pleroma.local_users.total"), last_value("pleroma.domains.total"), last_value("pleroma.local_statuses.total"), @@ -129,8 +135,8 @@ defmodule Pleroma.Web.Telemetry do ] end - def prometheus_metrics, do: summary_metrics() ++ distribution_metrics() - def live_dashboard_metrics, do: summary_metrics() + def prometheus_metrics, do: common_metrics() ++ distribution_metrics() + def live_dashboard_metrics, do: common_metrics() ++ summary_metrics() defp periodic_measurements do [ From 18ecae61839dd3d6e4d2aaaf2d0c01bc448fcfb5 Mon Sep 17 00:00:00 2001 From: Oneric Date: Sat, 3 Feb 2024 17:51:40 +0100 Subject: [PATCH 2/5] Use fully qualified function capture for telemetry event Otherwise we get warnings on startup as local captures and anonymous functions are supposedly less performant. 
--- lib/pleroma/job_queue_monitor.ex | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/lib/pleroma/job_queue_monitor.ex b/lib/pleroma/job_queue_monitor.ex index b5f124923..8d81ffcac 100644 --- a/lib/pleroma/job_queue_monitor.ex +++ b/lib/pleroma/job_queue_monitor.ex @@ -15,8 +15,19 @@ defmodule Pleroma.JobQueueMonitor do @impl true def init(state) do - :telemetry.attach("oban-monitor-failure", [:oban, :job, :exception], &handle_event/4, nil) - :telemetry.attach("oban-monitor-success", [:oban, :job, :stop], &handle_event/4, nil) + :telemetry.attach( + "oban-monitor-failure", + [:oban, :job, :exception], + &Pleroma.JobQueueMonitor.handle_event/4, + nil + ) + + :telemetry.attach( + "oban-monitor-success", + [:oban, :job, :stop], + &Pleroma.JobQueueMonitor.handle_event/4, + nil + ) {:ok, state} end From 8f8e1ff2145d4b48b6c45cf46bc8f85e6222c7c0 Mon Sep 17 00:00:00 2001 From: Oneric Date: Thu, 8 Feb 2024 00:10:46 +0100 Subject: [PATCH 3/5] Purge unused function scrub_css Commit e9f1897cfdb32c890e9eaf2e894128be5c7e1123 added this private function but it never had any users resulting in warnings each startup --- priv/scrubbers/default.ex | 2 -- 1 file changed, 2 deletions(-) diff --git a/priv/scrubbers/default.ex b/priv/scrubbers/default.ex index 6a97cbfd4..74de910fd 100644 --- a/priv/scrubbers/default.ex +++ b/priv/scrubbers/default.ex @@ -128,6 +128,4 @@ defmodule Pleroma.HTML.Scrubber.Default do Meta.allow_tag_with_these_attributes(:small, []) Meta.strip_everything_not_covered() - - defp scrub_css(value), do: value end From 16197ff57a181c4202317519b33d19826b53fbba Mon Sep 17 00:00:00 2001 From: Oneric Date: Sat, 3 Feb 2024 18:21:09 +0100 Subject: [PATCH 4/5] Display memory as MB in live dashboard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With kilobyte the resulting numbers got too large and were cut off in the charts, making them useless. 
However, even an idle Akkoma server’s memory usage is in the lower hundreds of megabytes, so we don’t need this much precision to begin with for the dashboard. Other metric users might prefer base units and can handle scaling in a smarter way, so keep this configurable. --- lib/pleroma/web/telemetry.ex | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/pleroma/web/telemetry.ex b/lib/pleroma/web/telemetry.ex index eecaffe88..3ea88b31d 100644 --- a/lib/pleroma/web/telemetry.ex +++ b/lib/pleroma/web/telemetry.ex @@ -102,7 +102,7 @@ defmodule Pleroma.Web.Telemetry do end # Summary metrics are currently not (yet) supported by the prometheus exporter - defp summary_metrics do + defp summary_metrics(byte_unit) do [ # Phoenix Metrics summary("phoenix.endpoint.stop.duration", @@ -119,7 +119,7 @@ defmodule Pleroma.Web.Telemetry do summary("pleroma.repo.query.idle_time", unit: {:native, :millisecond}), # VM Metrics - summary("vm.memory.total", unit: {:byte, :kilobyte}), + summary("vm.memory.total", unit: {:byte, byte_unit}), summary("vm.total_run_queue_lengths.total"), summary("vm.total_run_queue_lengths.cpu"), summary("vm.total_run_queue_lengths.io") @@ -136,7 +136,7 @@ defmodule Pleroma.Web.Telemetry do end def prometheus_metrics, do: common_metrics() ++ distribution_metrics() - def live_dashboard_metrics, do: common_metrics() ++ summary_metrics() + def live_dashboard_metrics, do: common_metrics() ++ summary_metrics(:megabyte) defp periodic_measurements do [ From 29f564f700e4c6998f0bbb830128e9d520cd1905 Mon Sep 17 00:00:00 2001 From: Oneric Date: Sat, 3 Feb 2024 18:28:55 +0100 Subject: [PATCH 5/5] Use fallbacks of summary metrics for prometheus --- lib/pleroma/web/telemetry.ex | 87 +++++++++++++++++++++++++++++++++++- 1 file changed, 86 insertions(+), 1 deletion(-) diff --git a/lib/pleroma/web/telemetry.ex b/lib/pleroma/web/telemetry.ex index 3ea88b31d..269f9f238 100644 --- a/lib/pleroma/web/telemetry.ex +++ b/lib/pleroma/web/telemetry.ex @@ -126,6 
+126,89 @@ defmodule Pleroma.Web.Telemetry do ] end + defp sum_counter_pair(basename, opts) do + [ + sum(basename <> ".psum", opts), + counter(basename <> ".pcount", opts) + ] + end + + # Prometheus exporter doesn't support summaries, so provide fallbacks + defp summary_fallback_metrics(byte_unit \\ :byte) do + # Summary metrics are not supported by the Prometheus exporter + # https://github.com/beam-telemetry/telemetry_metrics_prometheus_core/issues/11 + # and sum metrics currently only work with integers + # https://github.com/beam-telemetry/telemetry_metrics_prometheus_core/issues/35 + # + # For VM metrics this is kind of ok as they appear to always be integers + # and we can use sum + counter to get the average between polls from their change + # But for repo query times we need to use a full distribution + + simple_buckets = [0, 1, 2, 4, 8, 16] + simple_buckets_quick = for t <- simple_buckets, do: t / 100.0 + + # Already included in distribution metrics anyway: + # phoenix.router_dispatch.stop.duration + # pleroma.repo.query.total_time + # pleroma.repo.query.queue_time + dist_metrics = + [ + distribution("phoenix.endpoint.stop.duration.fdist", + event_name: [:phoenix, :endpoint, :stop], + measurement: :duration, + unit: {:native, :millisecond}, + reporter_options: [ + buckets: simple_buckets + ] + ), + distribution("pleroma.repo.query.decode_time.fdist", + event_name: [:pleroma, :repo, :query], + measurement: :decode_time, + unit: {:native, :millisecond}, + reporter_options: [ + buckets: simple_buckets_quick + ] + ), + distribution("pleroma.repo.query.query_time.fdist", + event_name: [:pleroma, :repo, :query], + measurement: :query_time, + unit: {:native, :millisecond}, + reporter_options: [ + buckets: simple_buckets + ] + ), + distribution("pleroma.repo.query.idle_time.fdist", + event_name: [:pleroma, :repo, :query], + measurement: :idle_time, + unit: {:native, :millisecond}, + reporter_options: [ + buckets: simple_buckets + ] + ) + ] + + vm_metrics = + 
sum_counter_pair("vm.memory.total", + event_name: [:vm, :memory], + measurement: :total, + unit: {:byte, byte_unit} + ) ++ + sum_counter_pair("vm.total_run_queue_lengths.total", + event_name: [:vm, :total_run_queue_lengths], + measurement: :total + ) ++ + sum_counter_pair("vm.total_run_queue_lengths.cpu", + event_name: [:vm, :total_run_queue_lengths], + measurement: :cpu + ) ++ + sum_counter_pair("vm.total_run_queue_lengths.io", + event_name: [:vm, :total_run_queue_lengths], + measurement: :io + ) + + dist_metrics ++ vm_metrics + end + defp common_metrics do [ last_value("pleroma.local_users.total"), @@ -135,7 +218,9 @@ defmodule Pleroma.Web.Telemetry do ] end - def prometheus_metrics, do: common_metrics() ++ distribution_metrics() + def prometheus_metrics, + do: common_metrics() ++ distribution_metrics() ++ summary_fallback_metrics() + def live_dashboard_metrics, do: common_metrics() ++ summary_metrics(:megabyte) defp periodic_measurements do