From 29f564f700e4c6998f0bbb830128e9d520cd1905 Mon Sep 17 00:00:00 2001
From: Oneric <oneric@oneric.stub>
Date: Sat, 3 Feb 2024 18:28:55 +0100
Subject: [PATCH] Use fallbacks of summary metrics for prometheus

---
 lib/pleroma/web/telemetry.ex | 87 +++++++++++++++++++++++++++++++++++-
 1 file changed, 86 insertions(+), 1 deletion(-)

diff --git a/lib/pleroma/web/telemetry.ex b/lib/pleroma/web/telemetry.ex
index 3ea88b31d..269f9f238 100644
--- a/lib/pleroma/web/telemetry.ex
+++ b/lib/pleroma/web/telemetry.ex
@@ -126,6 +126,89 @@ defp summary_metrics(byte_unit) do
     ]
   end
 
+  # Builds the metric pair "<basename>.psum" (sum) and "<basename>.pcount" (counter)
+  defp sum_counter_pair(basename, opts) do
+    total = sum(basename <> ".psum", opts)
+    samples = counter(basename <> ".pcount", opts)
+    [total, samples]
+  end
+
+  # Fallbacks for the summary metrics, which the Prometheus exporter cannot export
+  defp summary_fallback_metrics(byte_unit \\ :byte) do
+    # Summary metrics are not supported by the Prometheus exporter
+    #   https://github.com/beam-telemetry/telemetry_metrics_prometheus_core/issues/11
+    # and sum metrics currently only work with integers
+    #   https://github.com/beam-telemetry/telemetry_metrics_prometheus_core/issues/35
+    #
+    # For VM metrics this is kind of OK, as they appear to always be integers,
+    # and we can use sum + counter to get the average between polls from their change.
+    # But for repo query times we need to use a full distribution.
+
+    simple_buckets = [0, 1, 2, 4, 8, 16]
+    simple_buckets_quick = for t <- simple_buckets, do: t / 100.0
+
+    # Already covered by the regular distribution metrics, so not repeated here:
+    #   phoenix.router_dispatch.stop.duration
+    #   pleroma.repo.query.total_time
+    #   pleroma.repo.query.queue_time
+    dist_metrics =
+      [
+        distribution("phoenix.endpoint.stop.duration.fdist",
+          event_name: [:phoenix, :endpoint, :stop],
+          measurement: :duration,
+          unit: {:native, :millisecond},
+          reporter_options: [
+            buckets: simple_buckets
+          ]
+        ),
+        distribution("pleroma.repo.query.decode_time.fdist",
+          event_name: [:pleroma, :repo, :query],
+          measurement: :decode_time,
+          unit: {:native, :millisecond},
+          reporter_options: [
+            buckets: simple_buckets_quick
+          ]
+        ),
+        distribution("pleroma.repo.query.query_time.fdist",
+          event_name: [:pleroma, :repo, :query],
+          measurement: :query_time,
+          unit: {:native, :millisecond},
+          reporter_options: [
+            buckets: simple_buckets
+          ]
+        ),
+        distribution("pleroma.repo.query.idle_time.fdist",
+          event_name: [:pleroma, :repo, :query],
+          measurement: :idle_time,
+          unit: {:native, :millisecond},
+          reporter_options: [
+            buckets: simple_buckets
+          ]
+        )
+      ]
+
+    vm_metrics =
+      sum_counter_pair("vm.memory.total",
+        event_name: [:vm, :memory],
+        measurement: :total,
+        unit: {:byte, byte_unit}
+      ) ++
+        sum_counter_pair("vm.total_run_queue_lengths.total",
+          event_name: [:vm, :total_run_queue_lengths],
+          measurement: :total
+        ) ++
+        sum_counter_pair("vm.total_run_queue_lengths.cpu",
+          event_name: [:vm, :total_run_queue_lengths],
+          measurement: :cpu
+        ) ++
+        sum_counter_pair("vm.total_run_queue_lengths.io",
+          event_name: [:vm, :total_run_queue_lengths],
+          measurement: :io
+        )
+
+    dist_metrics ++ vm_metrics
+  end
+
   defp common_metrics do
     [
       last_value("pleroma.local_users.total"),
@@ -135,7 +218,9 @@ defp common_metrics do
     ]
   end
 
-  def prometheus_metrics, do: common_metrics() ++ distribution_metrics()
+  def prometheus_metrics,
+    do: Enum.concat([common_metrics(), distribution_metrics(), summary_fallback_metrics()])
+
   def live_dashboard_metrics, do: common_metrics() ++ summary_metrics(:megabyte)
 
   defp periodic_measurements do