diff --git a/app/chewy/accounts_index.rb b/app/chewy/accounts_index.rb index a3e26dc07..22908ea68 100644 --- a/app/chewy/accounts_index.rb +++ b/app/chewy/accounts_index.rb @@ -1,41 +1,28 @@ # frozen_string_literal: true class AccountsIndex < Chewy::Index - settings index: { - refresh_interval: '5m', - number_of_shards: 1, - number_of_replicas: 0, - }, - analysis: { - analyzer: { - content: { - tokenizer: 'whitespace', - filter: %w(lowercase asciifolding cjk_width), + settings index: { refresh_interval: '5m' }, analysis: { + filter: { + english_stop: { + type: 'stop', + stopwords: '_english_', }, - - edge_ngram: { - tokenizer: 'edge_ngram', - filter: %w(lowercase asciifolding cjk_width), + english_stemmer: { + type: 'stemmer', + language: 'english', }, - - sudachi_content: { - tokenizer: 'sudachi_tokenizer', - type: 'custom', - filter: %w( - lowercase - cjk_width - sudachi_part_of_speech - sudachi_ja_stop - sudachi_baseform - search - ), + english_possessive_stemmer: { + type: 'stemmer', + language: 'possessive_english', }, }, - normalizer: { - keyword: { - type: 'custom', - filter: %w(lowercase asciifolding cjk_width), + char_filter: { + tsconvert: { + type: 'stconvert', + keep_both: false, + delimiter: '#', + convert_type: 't2s', }, }, @@ -46,18 +33,101 @@ class AccountsIndex < Chewy::Index max_gram: 15, }, - sudachi_tokenizer: { - type: 'sudachi_tokenizer', - discard_punctuation: true, - resources_path: '/etc/elasticsearch/sudachi', - settings_path: '/etc/elasticsearch/sudachi/sudachi.json', + kuromoji_user_dict: { + type: 'kuromoji_tokenizer', + user_dictionary: 'userdic.txt', + }, + + nori_user_dict: { + type: 'nori_tokenizer', + decompound_mode: 'mixed', }, }, - filter: { - search: { - type: 'sudachi_split', - mode: 'search', + analyzer: { + title: { + tokenizer: 'whitespace', + filter: %w(lowercase asciifolding cjk_width), + }, + + ja_title: { + type: 'custom', + char_filter: %w( + icu_normalizer + kuromoji_iteration_mark + ), + tokenizer: 'kuromoji_user_dict', + filter: %w(lowercase asciifolding cjk_width), + }, + + ko_title: { + tokenizer: 'nori_user_dict', + filter: %w(lowercase asciifolding cjk_width), + }, + + zh_title: { + tokenizer: 'ik_max_word', + filter: %w(lowercase asciifolding cjk_width), + }, + + content: { + tokenizer: 'uax_url_email', + filter: %w( + english_possessive_stemmer + lowercase + asciifolding + cjk_width + english_stop + english_stemmer + ), + }, + + ja_content: { + type: 'custom', + char_filter: %w( + icu_normalizer + kuromoji_iteration_mark + ), + tokenizer: 'kuromoji_user_dict', + filter: %w( + kuromoji_baseform + kuromoji_part_of_speech + ja_stop + kuromoji_stemmer + kuromoji_number + cjk_width + lowercase + ), + }, + + ko_content: { + tokenizer: 'nori_user_dict', + filter: %w( + english_possessive_stemmer + lowercase + asciifolding + cjk_width + english_stop + english_stemmer + ), + }, + + zh_content: { + tokenizer: 'ik_max_word', + filter: %w( + english_possessive_stemmer + lowercase + asciifolding + cjk_width + english_stop + english_stemmer + ), + char_filter: %w(tsconvert), + }, + + edge_ngram: { + tokenizer: 'edge_ngram', + filter: %w(lowercase asciifolding cjk_width), }, }, } @@ -67,19 +137,24 @@ class AccountsIndex < Chewy::Index root date_detection: false do field :id, type: 'long' - field :display_name, type: 'text', analyzer: 'content' do - field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content' + field :display_name, type: 'text', analyzer: 'title' do + field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'title' + field :ja_stemmed, type: 'text', analyzer: 'ja_title', search_analyzer: 'title' + field :ko_stemmed, type: 'text', analyzer: 'ko_title', search_analyzer: 'title' + field :zh_stemmed, type: 'text', analyzer: 'zh_title', search_analyzer: 'title' end - field :acct, type: 'text', analyzer: 'content', value: ->(account) { [account.username, account.domain].compact.join('@') } do - field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content' + field :acct, type: 'text', analyzer: 'title', value: ->(account) { [account.username, account.domain].compact.join('@') } do + field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'title' end field :actor_type, type: 'keyword', normalizer: 'keyword' field :text, type: 'text', value: ->(account) { account.index_text } do - field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content' - field :stemmed, type: 'text', analyzer: 'sudachi_content' + field :en_stemmed, type: 'text', analyzer: 'content' + field :ja_stemmed, type: 'text', analyzer: 'ja_content' + field :ko_stemmed, type: 'text', analyzer: 'ko_content' + field :zh_stemmed, type: 'text', analyzer: 'zh_content' end field :discoverable, type: 'boolean' diff --git a/app/chewy/statuses_index.rb b/app/chewy/statuses_index.rb index 60b1894e1..467e81b65 100644 --- a/app/chewy/statuses_index.rb +++ b/app/chewy/statuses_index.rb @@ -1,38 +1,99 @@ # frozen_string_literal: true class StatusesIndex < Chewy::Index - settings index: { - refresh_interval: '15m', - number_of_shards: 1, - number_of_replicas: 0, - }, - analysis: { - tokenizer: { - sudachi_tokenizer: { - type: 'sudachi_tokenizer', - discard_punctuation: true, - resources_path: '/etc/elasticsearch/sudachi', - settings_path: '/etc/elasticsearch/sudachi/sudachi.json', - }, - }, - analyzer: { - content: { - filter: %w( - lowercase - cjk_width - sudachi_part_of_speech - sudachi_ja_stop - sudachi_baseform - search - ), - tokenizer: 'sudachi_tokenizer', - type: 'custom', - }, - }, + settings index: { refresh_interval: '15m' }, analysis: { filter: { - search: { - type: 'sudachi_split', - mode: 'search', + english_stop: { + type: 'stop', + stopwords: '_english_', + }, + + english_stemmer: { + type: 'stemmer', + language: 'english', + }, + + english_possessive_stemmer: { + type: 'stemmer', + language: 'possessive_english', + }, + }, + + char_filter: { + tsconvert: { + type: 'stconvert', + keep_both: false, + delimiter: '#', + convert_type: 't2s', + }, + }, + + tokenizer: { + kuromoji_user_dict: { + type: 'kuromoji_tokenizer', + user_dictionary: 'userdic.txt', + }, + + nori_user_dict: { + type: 'nori_tokenizer', + decompound_mode: 'mixed', + }, + }, + + analyzer: { + en_content: { + tokenizer: 'uax_url_email', + filter: %w( + english_possessive_stemmer + lowercase + asciifolding + cjk_width + english_stop + english_stemmer + ), + }, + + ja_content: { + type: 'custom', + char_filter: %w( + icu_normalizer + kuromoji_iteration_mark + ), + tokenizer: 'kuromoji_user_dict', + filter: %w( + kuromoji_baseform + kuromoji_part_of_speech + ja_stop + kuromoji_stemmer + kuromoji_number + cjk_width + lowercase + ), + }, + + ko_content: { + tokenizer: 'nori_user_dict', + filter: %w( + english_possessive_stemmer + lowercase + asciifolding + cjk_width + english_stop + english_stemmer + ), + }, + + zh_content: { + tokenizer: 'ik_max_word', + filter: %w( + english_possessive_stemmer + lowercase + asciifolding + cjk_width + english_stop + english_stemmer + ), + char_filter: %w(tsconvert), }, }, } @@ -59,6 +120,11 @@ class StatusesIndex < Chewy::Index data.each.with_object({}) { |(id, name), result| (result[id] ||= []).push(name) } end + crutch :votes do |collection| + data = ::PollVote.joins(:poll).where(poll: { status_id: collection.map(&:id) }).where(account: Account.local).pluck(:status_id, :account_id) + data.each.with_object({}) { |(id, name), result| (result[id] ||= []).push(name) } + end + crutch :emoji_reactions do |collection| data = ::EmojiReaction.where(status_id: collection.map(&:id)).where(account: Account.local).pluck(:status_id, :account_id) data.each.with_object({}) { |(id, name), result| (result[id] ||= []).push(name) } @@ -72,11 +138,28 @@ class StatusesIndex < Chewy::Index root date_detection: false do field :id, type: 'long' field :account_id, type: 'long' + field :domain, type: 'keyword', value: ->(status) { status.account_domain } field :text, type: 'text', value: ->(status) { status.index_text } do - field :stemmed, type: 'text', analyzer: 'content' + field :en_stemmed, type: 'text', analyzer: 'en_content' + field :ja_stemmed, type: 'text', analyzer: 'ja_content' + field :ko_stemmed, type: 'text', analyzer: 'ko_content' + field :zh_stemmed, type: 'text', analyzer: 'zh_content' end + field :mentioned_account_id, type: 'long' + field :tag_id, type: 'long' + field :media_type, type: 'keyword' + field :reference_type, type: 'keyword' + field :language, type: 'keyword' + + field :replies_count, type: 'long' + field :reblogs_count, type: 'long' + field :favourites_count, type: 'long' + field :emoji_reactions_count, type: 'long' + field :status_referred_by_count, type: 'long' + + field :visibility, type: 'keyword' field :searchable_by, type: 'long', value: ->(status, crutches) { status.searchable_by(crutches) } field :searchability, type: 'keyword', value: ->(status) { status.compute_searchability } end diff --git a/app/chewy/tags_index.rb b/app/chewy/tags_index.rb index a5f382b9e..0e66e9533 100644 --- a/app/chewy/tags_index.rb +++ b/app/chewy/tags_index.rb @@ -1,18 +1,48 @@ # frozen_string_literal: true class TagsIndex < Chewy::Index - settings index: { - refresh_interval: '15m', - number_of_shards: 1, - number_of_replicas: 0, - }, - analysis: { + settings index: { refresh_interval: '15m' }, analysis: { + char_filter: { + tsconvert: { + type: 'stconvert', + keep_both: false, + delimiter: '#', + convert_type: 't2s', + }, + }, + analyzer: { content: { tokenizer: 'keyword', filter: %w(lowercase asciifolding cjk_width), }, + ja_content: { + type: 'custom', + char_filter: %w(icu_normalizer kuromoji_iteration_mark), + tokenizer: 'kuromoji_user_dict', + filter: %w( + kuromoji_baseform + kuromoji_part_of_speech + ja_stop + kuromoji_stemmer + kuromoji_number + cjk_width + lowercase + ), + }, + + ko_content: { + tokenizer: 'nori_user_dict', + filter: %w(lowercase asciifolding cjk_width), + }, + + zh_content: { + tokenizer: 'ik_max_word', + filter: %w(lowercase asciifolding cjk_width), + char_filter: %w(tsconvert), + }, + edge_ngram: { tokenizer: 'edge_ngram', filter: %w(lowercase asciifolding cjk_width), @@ -25,6 +55,16 @@ class TagsIndex < Chewy::Index min_gram: 2, max_gram: 15, }, + + kuromoji_user_dict: { + type: 'kuromoji_tokenizer', + user_dictionary: 'userdic.txt', + }, + + nori_user_dict: { + type: 'nori_tokenizer', + decompound_mode: 'mixed', + }, }, } @@ -33,6 +73,9 @@ class TagsIndex < Chewy::Index root date_detection: false do field :name, type: 'text', analyzer: 'content' do field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content' + field :ja_stemmed, type: 'text', analyzer: 'ja_content', search_analyzer: 'content' + field :ko_stemmed, type: 'text', analyzer: 'ko_content', search_analyzer: 'content' + field :zh_stemmed, type: 'text', analyzer: 'zh_content', search_analyzer: 'content' end field :reviewed, type: 'boolean', value: ->(tag) { tag.reviewed? } diff --git a/app/controllers/api/v1/accounts/search_controller.rb b/app/controllers/api/v1/accounts/search_controller.rb index 42a06d277..54af89168 100644 --- a/app/controllers/api/v1/accounts/search_controller.rb +++ b/app/controllers/api/v1/accounts/search_controller.rb @@ -20,7 +20,8 @@ class Api::V1::Accounts::SearchController < Api::BaseController followers: truthy_param?(:followers), following: truthy_param?(:following), group_only: truthy_param?(:group_only), - offset: params[:offset] + offset: params[:offset], + language: current_user.setting_default_language ) end end diff --git a/app/controllers/api/v2/search_controller.rb b/app/controllers/api/v2/search_controller.rb index 3b9a31379..cfe98552c 100644 --- a/app/controllers/api/v2/search_controller.rb +++ b/app/controllers/api/v2/search_controller.rb @@ -26,7 +26,7 @@ class Api::V2::SearchController < Api::BaseController params[:q], current_account, limit_param(RESULTS_LIMIT), - search_params.merge(resolve: truthy_param?(:resolve), exclude_unreviewed: truthy_param?(:exclude_unreviewed)) + search_params.merge(resolve: truthy_param?(:resolve), exclude_unreviewed: truthy_param?(:exclude_unreviewed), language: current_user.setting_default_language) ) end diff --git a/app/lib/account_search_query_transformer.rb b/app/lib/account_search_query_transformer.rb index 6bac84a54..7061d00e3 100644 --- a/app/lib/account_search_query_transformer.rb +++ b/app/lib/account_search_query_transformer.rb @@ -4,7 +4,8 @@ class AccountSearchQueryTransformer < Parslet::Transform class Query attr_reader :should_clauses, :must_not_clauses, :must_clauses - def initialize(clauses) + def initialize(clauses, language) + @fields = ['text'].push(%w(ja ko zh).include?(language) ? "text.#{language}_stemmed" : 'text.en_stemmed') grouped = clauses.chunk(&:operator).to_h @should_clauses = grouped.fetch(:should, []) @must_not_clauses = grouped.fetch(:must_not, []) @@ -23,7 +24,7 @@ class AccountSearchQueryTransformer < Parslet::Transform def clause_to_query(clause) case clause when TermClause - { multi_match: { type: 'most_fields', query: clause.term, fields: ['text.edge_ngram', 'text.stemmed'] } } + { multi_match: { type: 'most_fields', query: clause.term, fields: @fields } } when PhraseClause { match_phrase: { text: { query: clause.phrase } } } else @@ -102,5 +103,5 @@ class AccountSearchQueryTransformer < Parslet::Transform end end - rule(query: sequence(:clauses)) { Query.new(clauses) } + rule(query: sequence(:clauses)) { Query.new(clauses, 'ja') } end diff --git a/app/lib/search_query_transformer.rb b/app/lib/search_query_transformer.rb index f9b16d2ab..a163c6a48 100644 --- a/app/lib/search_query_transformer.rb +++ b/app/lib/search_query_transformer.rb @@ -4,7 +4,8 @@ class SearchQueryTransformer < Parslet::Transform class Query attr_reader :should_clauses, :must_not_clauses, :must_clauses, :filter_clauses, :order_clauses - def initialize(clauses) + def initialize(clauses, language) + @fields = ['text'].push(%w(ja ko zh).include?(language) ? "text.#{language}_stemmed" : 'text.en_stemmed') grouped = clauses.chunk(&:operator).to_h @should_clauses = grouped.fetch(:should, []) @must_not_clauses = grouped.fetch(:must_not, []) @@ -27,7 +28,7 @@ class SearchQueryTransformer < Parslet::Transform def clause_to_query(clause) case clause when TermClause - { multi_match: { type: 'most_fields', query: clause.term, fields: ['text', 'text.stemmed'] } } + { multi_match: { type: 'most_fields', query: clause.term, fields: @fields } } when PhraseClause { match_phrase: { text: { query: clause.phrase } } } else @@ -94,7 +95,7 @@ class SearchQueryTransformer < Parslet::Transform class PrefixClause attr_reader :filter, :operator, :term - def initialize(prefix, term) + def initialize(prefix, operator, term) case prefix when 'from' @operator = :filter @@ -106,7 +107,7 @@ class SearchQueryTransformer < Parslet::Transform @term = account.id when 'order' - raise "Unknown order: #{term}" unless %w(asc desc).include?(term) + raise "Unknown order: #{term}" unless %w(asc desc score).include?(term) @operator = :order @term = term @@ -121,7 +122,7 @@ class SearchQueryTransformer < Parslet::Transform operator = clause[:operator]&.to_s if clause[:prefix] - PrefixClause.new(prefix, clause[:term].to_s) + PrefixClause.new(prefix, operator, clause[:term].to_s) elsif clause[:term] TermClause.new(prefix, operator, clause[:term].to_s) elsif clause[:shortcode] @@ -133,5 +134,5 @@ class SearchQueryTransformer < Parslet::Transform end end - rule(query: sequence(:clauses)) { Query.new(clauses) } + rule(query: sequence(:clauses)) { Query.new(clauses, 'ja') } end diff --git a/app/models/mention.rb b/app/models/mention.rb index 7667d748d..2b18589a7 100644 --- a/app/models/mention.rb +++ b/app/models/mention.rb @@ -14,6 +14,8 @@ class Mention < ApplicationRecord include Paginable + update_index('statuses') { status } + belongs_to :account, inverse_of: :mentions belongs_to :status diff --git a/app/models/preview_card.rb b/app/models/preview_card.rb index 7e191869a..70c2ae25d 100644 --- a/app/models/preview_card.rb +++ b/app/models/preview_card.rb @@ -37,6 +37,8 @@ class PreviewCard < ApplicationRecord self.inheritance_column = false + update_index('statuses') { statuses } + enum type: [:link, :photo, :video, :rich] has_and_belongs_to_many :statuses diff --git a/app/models/status.rb b/app/models/status.rb index 953b60889..3d60b6260 100644 --- a/app/models/status.rb +++ b/app/models/status.rb @@ -178,6 +178,7 @@ class Status < ApplicationRecord ids += favourites.where(account: Account.local).pluck(:account_id) ids += reblogs.where(account: Account.local).pluck(:account_id) ids += bookmarks.where(account: Account.local).pluck(:account_id) + ids += poll.votes.where(account: Account.local).pluck(:account_id) if poll.present? ids += emoji_reactions.where(account: Account.local).pluck(:account_id) ids += referred_by_statuses.where(account: Account.local).pluck(:account_id) else @@ -185,6 +186,7 @@ class Status < ApplicationRecord ids += preloaded.favourites[id] || [] ids += preloaded.reblogs[id] || [] ids += preloaded.bookmarks[id] || [] + ids += preloaded.votes[id] || [] ids += preloaded.emoji_reactions[id] || [] ids += preloaded.status_references[id] || [] end @@ -331,6 +333,22 @@ class Status < ApplicationRecord @index_text ||= [spoiler_text, Formatter.instance.plaintext(self)].concat(media_attachments.map(&:description)).concat(preloadable_poll ? preloadable_poll.options : []).concat(quote? ? ["QT: [#{quote.url || ActivityPub::TagManager.instance.url_for(quote)}]"] : []).filter(&:present?).join("\n\n") end + def tag_id + tags.map(&:id) + end + + def mentioned_account_id + mentions.map(&:account_id) + end + + def media_type + media_attachments&.first&.type + end + + def reference_type + preview_card&.type + end + def replies_count status_stat&.replies_count || 0 end diff --git a/app/models/status_stat.rb b/app/models/status_stat.rb index 4c8d7bad6..8de8e2683 100644 --- a/app/models/status_stat.rb +++ b/app/models/status_stat.rb @@ -17,6 +17,8 @@ # class StatusStat < ApplicationRecord + update_index('statuses') { status } + belongs_to :status, inverse_of: :status_stat after_commit :reset_parent_cache diff --git a/app/services/account_search_service.rb b/app/services/account_search_service.rb index 7f6139b53..ef69c3c73 100644 --- a/app/services/account_search_service.rb +++ b/app/services/account_search_service.rb @@ -8,6 +8,8 @@ class AccountSearchService < BaseService @query = query&.strip&.gsub(/\A@/, '') @limit = options[:limit].to_i @offset = options[:offset].to_i + @lang = options.delete(:language).to_s + @fields = %w(acct.edge_ngram acct display_name).push(%w(ja ko zh).include?(@lang) ? "display_name.#{@lang}_stemmed" : 'display_name.edge_ngram') @options = options @account = account @@ -85,7 +87,7 @@ class AccountSearchService < BaseService end def from_elasticsearch(count = false) - must_clauses = [{ multi_match: { query: terms_for_query, fields: likely_acct? ? %w(acct.edge_ngram acct) : %w(acct.edge_ngram acct display_name.edge_ngram display_name), type: 'most_fields', operator: 'and' } }] + must_clauses = [{ multi_match: { query: terms_for_query, fields: likely_acct? ? %w(acct.edge_ngram acct) : @fields, type: 'most_fields', operator: 'and' } }] should_clauses = [] if account diff --git a/app/services/search_service.rb b/app/services/search_service.rb index 66fd700ea..5e8639259 100644 --- a/app/services/search_service.rb +++ b/app/services/search_service.rb @@ -46,7 +46,8 @@ class SearchService < BaseService @account, limit: @limit, resolve: @resolve, - offset: @offset + offset: @offset, + language: @options[:language] ) end @@ -54,6 +55,7 @@ class SearchService < BaseService AccountSearchService.new.count( @query, @account, + language: @options[:language] ) end @@ -63,7 +65,8 @@ class SearchService < BaseService @account, limit: @limit, resolve: @resolve, - offset: @offset + offset: @offset, + language: @options[:language] ) end @@ -109,7 +112,8 @@ class SearchService < BaseService @query, limit: @limit, offset: @offset, - exclude_unreviewed: @options[:exclude_unreviewed] + exclude_unreviewed: @options[:exclude_unreviewed], + language: @options[:language] ) end @@ -146,7 +150,7 @@ class SearchService < BaseService end def account_searchable? - account_search? && (account_search_explicit_pattern? || @query.match?(/\A#{Account::USERNAME_RE}\Z/)) + account_search? end def hashtag_searchable? diff --git a/app/services/tag_search_service.rb b/app/services/tag_search_service.rb index b78d65625..2d23aa9e8 100644 --- a/app/services/tag_search_service.rb +++ b/app/services/tag_search_service.rb @@ -5,6 +5,8 @@ class TagSearchService < BaseService @query = query.strip.gsub(/\A#/, '') @offset = options.delete(:offset).to_i @limit = options.delete(:limit).to_i + @lang = options.delete(:language).to_s + @fields = ['name'].push(%w(ja ko zh).include?(@lang) ? "name.#{@lang}_stemmed" : 'name.edge_ngram') @options = options results = from_elasticsearch if Chewy.enabled? @@ -21,7 +23,7 @@ class TagSearchService < BaseService query: { multi_match: { query: @query, - fields: %w(name.edge_ngram name), + fields: @fields, type: 'most_fields', operator: 'and', },