forked from AkkomaGang/akkoma
Improves RetryQueue behavior
reduces to one single timer firing once a second switches to a parallel worker model
This commit is contained in:
parent
f5d7b0003e
commit
cb286fdeba
2 changed files with 202 additions and 24 deletions
|
@ -7,20 +7,34 @@ defmodule Pleroma.Web.Federator.RetryQueue do
|
|||
|
||||
require Logger
|
||||
|
||||
# initial timeout, 5 min
|
||||
@initial_timeout 30_000
|
||||
# seconds
|
||||
@initial_timeout 30
|
||||
@max_retries 5
|
||||
|
||||
@max_jobs 20
|
||||
|
||||
def init(args) do
|
||||
{:ok, args}
|
||||
queue_table = :ets.new(:pleroma_retry_queue, [:bag, :protected])
|
||||
|
||||
{:ok, %{args | queue_table: queue_table, running_jobs: :sets.new()}}
|
||||
end
|
||||
|
||||
def start_link() do
|
||||
enabled = Pleroma.Config.get([:retry_queue, :enabled], false)
|
||||
enabled =
|
||||
if Mix.env() == :test, do: true, else: Pleroma.Config.get([:retry_queue, :enabled], false)
|
||||
|
||||
if enabled do
|
||||
Logger.info("Starting retry queue")
|
||||
GenServer.start_link(__MODULE__, %{delivered: 0, dropped: 0}, name: __MODULE__)
|
||||
|
||||
linkres =
|
||||
GenServer.start_link(
|
||||
__MODULE__,
|
||||
%{delivered: 0, dropped: 0, queue_table: nil, running_jobs: nil},
|
||||
name: __MODULE__
|
||||
)
|
||||
|
||||
maybe_kickoff_timer()
|
||||
linkres
|
||||
else
|
||||
Logger.info("Retry queue disabled")
|
||||
:ignore
|
||||
|
@ -31,6 +45,14 @@ def enqueue(data, transport, retries \\ 0) do
|
|||
GenServer.cast(__MODULE__, {:maybe_enqueue, data, transport, retries + 1})
|
||||
end
|
||||
|
||||
def get_stats() do
|
||||
GenServer.call(__MODULE__, :get_stats)
|
||||
end
|
||||
|
||||
def reset_stats() do
|
||||
GenServer.call(__MODULE__, :reset_stats)
|
||||
end
|
||||
|
||||
def get_retry_params(retries) do
|
||||
if retries > @max_retries do
|
||||
{:drop, "Max retries reached"}
|
||||
|
@ -39,16 +61,118 @@ def get_retry_params(retries) do
|
|||
end
|
||||
end
|
||||
|
||||
def handle_cast({:maybe_enqueue, data, transport, retries}, %{dropped: drop_count} = state) do
|
||||
case get_retry_params(retries) do
|
||||
{:retry, timeout} ->
|
||||
Process.send_after(
|
||||
__MODULE__,
|
||||
{:send, data, transport, retries},
|
||||
timeout
|
||||
def get_retry_timer_interval() do
|
||||
Pleroma.Config.get([:retry_queue, :interval], 1000)
|
||||
end
|
||||
|
||||
defp ets_count_expires(table, current_time) do
|
||||
:ets.select_count(
|
||||
table,
|
||||
[
|
||||
{
|
||||
{:"$1", :"$2"},
|
||||
[{:"=<", :"$1", {:const, current_time}}],
|
||||
[true]
|
||||
}
|
||||
]
|
||||
)
|
||||
end
|
||||
|
||||
defp ets_pop_n_expired(table, current_time, desired) do
|
||||
{popped, _continuation} =
|
||||
:ets.select(
|
||||
table,
|
||||
[
|
||||
{
|
||||
{:"$1", :"$2"},
|
||||
[{:"=<", :"$1", {:const, current_time}}],
|
||||
[:"$_"]
|
||||
}
|
||||
],
|
||||
desired
|
||||
)
|
||||
|
||||
{:noreply, state}
|
||||
popped
|
||||
|> List.foldl(true, fn e, acc ->
|
||||
:ets.delete_object(table, e)
|
||||
acc
|
||||
end)
|
||||
|
||||
popped
|
||||
end
|
||||
|
||||
def maybe_start_job(running_jobs, queue_table) do
|
||||
# we don't want to hit the ets or the DateTime more times than we have to
|
||||
# could optimize slightly further by not using the count, and instead grabbing
|
||||
# up to N objects early...
|
||||
current_time = DateTime.to_unix(DateTime.utc_now())
|
||||
n_running_jobs = :sets.size(running_jobs)
|
||||
|
||||
if n_running_jobs < @max_jobs do
|
||||
n_ready_jobs = ets_count_expires(queue_table, current_time)
|
||||
|
||||
if n_ready_jobs > 0 do
|
||||
# figure out how many we could start
|
||||
available_job_slots = @max_jobs - n_running_jobs
|
||||
start_n_jobs(running_jobs, queue_table, current_time, available_job_slots)
|
||||
else
|
||||
running_jobs
|
||||
end
|
||||
else
|
||||
running_jobs
|
||||
end
|
||||
end
|
||||
|
||||
defp start_n_jobs(running_jobs, _queue_table, _current_time, 0) do
|
||||
running_jobs
|
||||
end
|
||||
|
||||
defp start_n_jobs(running_jobs, queue_table, current_time, available_job_slots)
|
||||
when available_job_slots > 0 do
|
||||
candidates = ets_pop_n_expired(queue_table, current_time, available_job_slots)
|
||||
|
||||
candidates
|
||||
|> List.foldl(running_jobs, fn {_, e}, rj ->
|
||||
{:ok, pid} = Task.start(fn -> worker(e) end)
|
||||
mref = Process.monitor(pid)
|
||||
:sets.add_element(mref, rj)
|
||||
end)
|
||||
end
|
||||
|
||||
def worker({:send, data, transport, retries}) do
|
||||
case transport.publish_one(data) do
|
||||
{:ok, _} ->
|
||||
GenServer.cast(__MODULE__, :inc_delivered)
|
||||
:delivered
|
||||
|
||||
{:error, _reason} ->
|
||||
enqueue(data, transport, retries)
|
||||
:retry
|
||||
end
|
||||
end
|
||||
|
||||
def handle_call(:get_stats, _from, %{delivered: delivery_count, dropped: drop_count} = state) do
|
||||
{:reply, %{delivered: delivery_count, dropped: drop_count}, state}
|
||||
end
|
||||
|
||||
def handle_call(:reset_stats, _from, %{delivered: delivery_count, dropped: drop_count} = state) do
|
||||
{:reply, %{delivered: delivery_count, dropped: drop_count},
|
||||
%{state | delivered: 0, dropped: 0}}
|
||||
end
|
||||
|
||||
def handle_cast(:reset_stats, state) do
|
||||
{:noreply, %{state | delivered: 0, dropped: 0}}
|
||||
end
|
||||
|
||||
def handle_cast(
|
||||
{:maybe_enqueue, data, transport, retries},
|
||||
%{dropped: drop_count, queue_table: queue_table, running_jobs: running_jobs} = state
|
||||
) do
|
||||
case get_retry_params(retries) do
|
||||
{:retry, timeout} ->
|
||||
:ets.insert(queue_table, {timeout, {:send, data, transport, retries}})
|
||||
running_jobs = maybe_start_job(running_jobs, queue_table)
|
||||
{:noreply, %{state | running_jobs: running_jobs}}
|
||||
|
||||
{:drop, message} ->
|
||||
Logger.debug(message)
|
||||
|
@ -56,6 +180,20 @@ def handle_cast({:maybe_enqueue, data, transport, retries}, %{dropped: drop_coun
|
|||
end
|
||||
end
|
||||
|
||||
def handle_cast(:kickoff_timer, state) do
|
||||
retry_interval = get_retry_timer_interval()
|
||||
Process.send_after(__MODULE__, :retry_timer_run, retry_interval)
|
||||
{:noreply, state}
|
||||
end
|
||||
|
||||
def handle_cast(:inc_delivered, %{delivered: delivery_count} = state) do
|
||||
{:noreply, %{state | delivered: delivery_count + 1}}
|
||||
end
|
||||
|
||||
def handle_cast(:inc_dropped, %{dropped: drop_count} = state) do
|
||||
{:noreply, %{state | dropped: drop_count + 1}}
|
||||
end
|
||||
|
||||
def handle_info({:send, data, transport, retries}, %{delivered: delivery_count} = state) do
|
||||
case transport.publish_one(data) do
|
||||
{:ok, _} ->
|
||||
|
@ -67,12 +205,39 @@ def handle_info({:send, data, transport, retries}, %{delivered: delivery_count}
|
|||
end
|
||||
end
|
||||
|
||||
def handle_info(
|
||||
:retry_timer_run,
|
||||
%{queue_table: queue_table, running_jobs: running_jobs} = state
|
||||
) do
|
||||
maybe_kickoff_timer()
|
||||
running_jobs = maybe_start_job(running_jobs, queue_table)
|
||||
{:noreply, %{state | running_jobs: running_jobs}}
|
||||
end
|
||||
|
||||
def handle_info({:DOWN, ref, :process, _pid, _reason}, state) do
|
||||
%{running_jobs: running_jobs, queue_table: queue_table} = state
|
||||
running_jobs = :sets.del_element(ref, running_jobs)
|
||||
running_jobs = maybe_start_job(running_jobs, queue_table)
|
||||
{:noreply, %{state | running_jobs: running_jobs}}
|
||||
end
|
||||
|
||||
def handle_info(unknown, state) do
|
||||
Logger.debug("RetryQueue: don't know what to do with #{inspect(unknown)}, ignoring")
|
||||
{:noreply, state}
|
||||
end
|
||||
|
||||
if Mix.env() == :test do
|
||||
defp growth_function(_retries) do
|
||||
_shutit = @initial_timeout
|
||||
DateTime.to_unix(DateTime.utc_now()) - 1
|
||||
end
|
||||
else
|
||||
defp growth_function(retries) do
|
||||
round(@initial_timeout * :math.pow(retries, 3))
|
||||
round(@initial_timeout * :math.pow(retries, 3)) + DateTime.to_unix(DateTime.utc_now())
|
||||
end
|
||||
end
|
||||
|
||||
defp maybe_kickoff_timer() do
|
||||
GenServer.cast(__MODULE__, :kickoff_timer)
|
||||
end
|
||||
end
|
||||
|
|
|
@ -3,7 +3,8 @@
|
|||
# SPDX-License-Identifier: AGPL-3.0-only
|
||||
|
||||
defmodule MockActivityPub do
|
||||
def publish_one(ret) do
|
||||
def publish_one({ret, waiter}) do
|
||||
send(waiter, :complete)
|
||||
{ret, "success"}
|
||||
end
|
||||
end
|
||||
|
@ -15,21 +16,33 @@ defmodule Pleroma.Web.Federator.RetryQueueTest do
|
|||
@small_retry_count 0
|
||||
@hopeless_retry_count 10
|
||||
|
||||
setup do
|
||||
RetryQueue.reset_stats()
|
||||
end
|
||||
|
||||
test "RetryQueue responds to stats request" do
|
||||
assert %{delivered: 0, dropped: 0} == RetryQueue.get_stats()
|
||||
end
|
||||
|
||||
test "failed posts are retried" do
|
||||
{:retry, _timeout} = RetryQueue.get_retry_params(@small_retry_count)
|
||||
|
||||
assert {:noreply, %{delivered: 1}} ==
|
||||
RetryQueue.handle_info({:send, :ok, MockActivityPub, @small_retry_count}, %{
|
||||
delivered: 0
|
||||
})
|
||||
wait_task =
|
||||
Task.async(fn ->
|
||||
receive do
|
||||
:complete -> :ok
|
||||
end
|
||||
end)
|
||||
|
||||
RetryQueue.enqueue({:ok, wait_task.pid}, MockActivityPub, @small_retry_count)
|
||||
Task.await(wait_task)
|
||||
assert %{delivered: 1, dropped: 0} == RetryQueue.get_stats()
|
||||
end
|
||||
|
||||
test "posts that have been tried too many times are dropped" do
|
||||
{:drop, _timeout} = RetryQueue.get_retry_params(@hopeless_retry_count)
|
||||
|
||||
assert {:noreply, %{dropped: 1}} ==
|
||||
RetryQueue.handle_cast({:maybe_enqueue, %{}, nil, @hopeless_retry_count}, %{
|
||||
dropped: 0
|
||||
})
|
||||
RetryQueue.enqueue({:ok, nil}, MockActivityPub, @hopeless_retry_count)
|
||||
assert %{delivered: 0, dropped: 1} == RetryQueue.get_stats()
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in a new issue