From bc9e76cce78eaaa16d651432d23499a1cd9dfbd9 Mon Sep 17 00:00:00 2001
From: FloatingGhost
Date: Thu, 30 Jun 2022 17:36:57 +0100
Subject: [PATCH] Add documentation for ES search

---
 config/description.exs                        |  85 +++++++++++
 docs/configuration/search.md                  |  40 +++++
 lib/mix/tasks/pleroma/search/elasticsearch.ex |   9 ++
 lib/mix/tasks/pleroma/search/meilisearch.ex   | 144 ++++++++++++++++++
 4 files changed, 278 insertions(+)
 create mode 100644 lib/mix/tasks/pleroma/search/elasticsearch.ex
 create mode 100644 lib/mix/tasks/pleroma/search/meilisearch.ex

diff --git a/config/description.exs b/config/description.exs
index 2d068556f..ac3faa346 100644
--- a/config/description.exs
+++ b/config/description.exs
@@ -3472,5 +3472,90 @@
         suggestion: [100_000]
       }
     ]
+  },
+  %{
+    group: :pleroma,
+    key: Pleroma.Search.Elasticsearch.Cluster,
+    type: :group,
+    description: "Elasticsearch settings.",
+    children: [
+      %{
+        key: :url,
+        type: :string,
+        description: "Elasticsearch URL.",
+        suggestion: ["http://127.0.0.1:9200/"]
+      },
+      %{
+        key: :username,
+        type: :string,
+        description: "Username to connect to ES. Set to nil if your cluster is unauthenticated.",
+        suggestion: ["elastic"]
+      },
+      %{
+        key: :password,
+        type: :string,
+        description: "Password to connect to ES. Set to nil if your cluster is unauthenticated.",
+        suggestion: ["changeme"]
+      },
+      %{
+        key: :api,
+        type: :module,
+        description:
+          "The API module used by Elasticsearch. Should always be Elasticsearch.API.HTTP",
+        suggestion: [Elasticsearch.API.HTTP]
+      },
+      %{
+        key: :json_library,
+        type: :module,
+        description:
+          "The JSON module used to encode/decode when communicating with Elasticsearch",
+        suggestion: [Jason]
+      },
+      %{
+        key: :indexes,
+        type: :map,
+        description: "The indices to set up in Elasticsearch",
+        children: [
+          %{
+            key: :activities,
+            type: :map,
+            description: "Config for the index to use for activities",
+            children: [
+              %{
+                key: :settings,
+                type: :string,
+                description:
+                  "Path to the file containing index settings for the activities index. Should contain a mapping.",
+                suggestion: ["priv/es-mappings/activity.json"]
+              },
+              %{
+                key: :store,
+                type: :module,
+                description: "The internal store module",
+                suggestion: [Pleroma.Search.Elasticsearch.Store]
+              },
+              %{
+                key: :sources,
+                type: {:list, :module},
+                description: "The internal types to use for this index",
+                suggestion: [[Pleroma.Activity]]
+              },
+              %{
+                key: :bulk_page_size,
+                type: :int,
+                description: "Size for bulk put requests, mostly used on building the index",
+                suggestion: [5000]
+              },
+              %{
+                key: :bulk_wait_interval,
+                type: :int,
+                description: "Time to wait between bulk put requests (in ms)",
+                suggestion: [15_000]
+              }
+            ]
+          }
+        ]
+      }
+    ]
   }
 ]
diff --git a/docs/configuration/search.md b/docs/configuration/search.md
index f131948a7..7c1093ab9 100644
--- a/docs/configuration/search.md
+++ b/docs/configuration/search.md
@@ -121,3 +121,43 @@ This will clear **all** the posts from the search index. Note, that deleted post
 there is no need to actually clear the whole index, unless you want **all** of it gone. That said, the index does not hold
 any information that cannot be re-created from the database, it should also generally be a lot smaller than the size of your database.
 Still, the size depends on the amount of text in posts.
+
+## Elasticsearch
+
+As with meilisearch, this can be rather memory-hungry, but it is very good at what it does.
+
+To use [elasticsearch](https://www.elastic.co/), set the search module to `Pleroma.Search.Elasticsearch`:
+
+> config :pleroma, Pleroma.Search, module: Pleroma.Search.Elasticsearch
+
+You then need to set the URL and authentication credentials if relevant.
+
+> config :pleroma, Pleroma.Search.Elasticsearch.Cluster,
+>    url: "http://127.0.0.1:9200/",
+>    username: "elastic",
+>    password: "changeme"
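+
+The remaining cluster options documented in `config/description.exs` (the API module, the JSON library, and the index
+definition itself) can be set under the same key if you need to override them. The values below simply mirror the
+suggested defaults from the config description; unless you know you need something different, they can be left alone:
+
+> config :pleroma, Pleroma.Search.Elasticsearch.Cluster,
+>    api: Elasticsearch.API.HTTP,
+>    json_library: Jason,
+>    indexes: %{
+>      activities: %{
+>        settings: "priv/es-mappings/activity.json",
+>        store: Pleroma.Search.Elasticsearch.Store,
+>        sources: [Pleroma.Activity],
+>        bulk_page_size: 5000,
+>        bulk_wait_interval: 15_000
+>      }
+>    }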
+
+### Initial indexing
+
+After setting up the configuration, you'll want to index all of your already existing posts. Only public posts are indexed. You'll only
+have to do this once, but it might take a while, depending on the amount of posts your instance has seen.
+
+The sequence of actions is as follows:
+
+1. First, change the configuration to use `Pleroma.Search.Elasticsearch` as the search backend
+2. Restart your instance; at this point it can be used while the search indexing is running, though search won't return anything yet
+3. Start the initial indexing process (as described below with `index`),
+   and wait until the task says it has sent everything from the database to the index
+4. Wait until the indexing task exits
+
+To start the initial indexing, run the indexing task:
+
+=== "OTP"
+```sh
+./bin/pleroma_ctl search.elasticsearch index activities --cluster Pleroma.Search.Elasticsearch.Cluster
+```
+
+=== "From Source"
+```sh
+mix elasticsearch.build activities --cluster Pleroma.Search.Elasticsearch.Cluster
+```
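+
+Once the task has finished, you can sanity-check the index directly from Elasticsearch. The `_count` endpoint is part of
+the standard Elasticsearch HTTP API; the URL, credentials, and `activities` index name below only assume the example
+configuration above, so adjust them to match your setup:
+
+```sh
+curl -s -u elastic:changeme "http://127.0.0.1:9200/activities/_count?pretty"
+```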
?", ^Pleroma.Constants.as_public()) or + fragment("data->'cc' \\? ?", ^Pleroma.Constants.as_public())), + order_by: [desc: fragment("data->'published'")] + ) + + count = query |> Pleroma.Repo.aggregate(:count, :data) + IO.puts("Entries to index: #{count}") + + Pleroma.Repo.stream( + query, + timeout: :infinity + ) + |> Stream.map(&Pleroma.Search.Meilisearch.object_to_search_data/1) + |> Stream.filter(fn o -> not is_nil(o) end) + |> Stream.chunk_every(chunk_size) + |> Stream.transform(0, fn objects, acc -> + new_acc = acc + Enum.count(objects) + + # Reset to the beginning of the line and rewrite it + IO.write("\r") + IO.write("Indexed #{new_acc} entries") + + {[objects], new_acc} + end) + |> Stream.each(fn objects -> + result = + meili_put( + "/indexes/objects/documents", + objects + ) + + with {:ok, res} <- result do + if not Map.has_key?(res, "uid") do + IO.puts("\nFailed to index: #{inspect(result)}") + end + else + e -> IO.puts("\nFailed to index due to network error: #{inspect(e)}") + end + end) + |> Stream.run() + end, + timeout: :infinity + ) + + IO.write("\n") + end + + def run(["clear"]) do + start_pleroma() + + meili_delete!("/indexes/objects/documents") + end + + def run(["show-keys", master_key]) do + start_pleroma() + + endpoint = Pleroma.Config.get([Pleroma.Search.Meilisearch, :url]) + + {:ok, result} = + Pleroma.HTTP.get( + Path.join(endpoint, "/keys"), + [{"Authorization", "Bearer #{master_key}"}] + ) + + decoded = Jason.decode!(result.body) + + if decoded["results"] do + Enum.each(decoded["results"], fn %{"description" => desc, "key" => key} -> + IO.puts("#{desc}: #{key}") + end) + else + IO.puts("Error fetching the keys, check the master key is correct: #{inspect(decoded)}") + end + end + + def run(["stats"]) do + start_pleroma() + + {:ok, result} = meili_get("/indexes/objects/stats") + IO.puts("Number of entries: #{result["numberOfDocuments"]}") + IO.puts("Indexing? #{result["isIndexing"]}") + end +end