search (WIP)

noellabo 2023-01-27 09:04:57 +09:00
parent 211f037931
commit bbf0df5a11
14 changed files with 336 additions and 100 deletions

app/chewy/accounts_index.rb

@@ -1,41 +1,28 @@
# frozen_string_literal: true
class AccountsIndex < Chewy::Index
settings index: {
refresh_interval: '5m',
number_of_shards: 1,
number_of_replicas: 0,
},
analysis: {
analyzer: {
content: {
tokenizer: 'whitespace',
filter: %w(lowercase asciifolding cjk_width),
settings index: { refresh_interval: '5m' }, analysis: {
filter: {
english_stop: {
type: 'stop',
stopwords: '_english_',
},
edge_ngram: {
tokenizer: 'edge_ngram',
filter: %w(lowercase asciifolding cjk_width),
english_stemmer: {
type: 'stemmer',
language: 'english',
},
sudachi_content: {
tokenizer: 'sudachi_tokenizer',
type: 'custom',
filter: %w(
lowercase
cjk_width
sudachi_part_of_speech
sudachi_ja_stop
sudachi_baseform
search
),
english_possessive_stemmer: {
type: 'stemmer',
language: 'possessive_english',
},
},
normalizer: {
keyword: {
type: 'custom',
filter: %w(lowercase asciifolding cjk_width),
char_filter: {
tsconvert: {
type: 'stconvert',
keep_both: false,
delimiter: '#',
convert_type: 't2s',
},
},
@@ -46,18 +33,101 @@ class AccountsIndex < Chewy::Index
max_gram: 15,
},
sudachi_tokenizer: {
type: 'sudachi_tokenizer',
discard_punctuation: true,
resources_path: '/etc/elasticsearch/sudachi',
settings_path: '/etc/elasticsearch/sudachi/sudachi.json',
kuromoji_user_dict: {
type: 'kuromoji_tokenizer',
user_dictionary: 'userdic.txt',
},
nori_user_dict: {
type: 'nori_tokenizer',
decompound_mode: 'mixed',
},
},
filter: {
search: {
type: 'sudachi_split',
mode: 'search',
analyzer: {
title: {
tokenizer: 'whitespace',
filter: %w(lowercase asciifolding cjk_width),
},
ja_title: {
type: 'custom',
char_filter: %w(
icu_normalizer
kuromoji_iteration_mark
),
tokenizer: 'kuromoji_user_dict',
filter: %w(lowercase asciifolding cjk_width),
},
ko_title: {
tokenizer: 'nori_user_dict',
filter: %w(lowercase asciifolding cjk_width),
},
zh_title: {
tokenizer: 'ik_max_word',
filter: %w(lowercase asciifolding cjk_width),
},
content: {
tokenizer: 'uax_url_email',
filter: %w(
english_possessive_stemmer
lowercase
asciifolding
cjk_width
english_stop
english_stemmer
),
},
ja_content: {
type: 'custom',
char_filter: %w(
icu_normalizer
kuromoji_iteration_mark
),
tokenizer: 'kuromoji_user_dict',
filter: %w(
kuromoji_baseform
kuromoji_part_of_speech
ja_stop
kuromoji_stemmer
kuromoji_number
cjk_width
lowercase
),
},
ko_content: {
tokenizer: 'nori_user_dict',
filter: %w(
english_possessive_stemmer
lowercase
asciifolding
cjk_width
english_stop
english_stemmer
),
},
zh_content: {
tokenizer: 'ik_max_word',
filter: %w(
english_possessive_stemmer
lowercase
asciifolding
cjk_width
english_stop
english_stemmer
),
char_filter: %w(tsconvert),
},
edge_ngram: {
tokenizer: 'edge_ngram',
filter: %w(lowercase asciifolding cjk_width),
},
},
}
@@ -67,19 +137,24 @@ class AccountsIndex < Chewy::Index
root date_detection: false do
field :id, type: 'long'
field :display_name, type: 'text', analyzer: 'content' do
field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content'
field :display_name, type: 'text', analyzer: 'title' do
field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'title'
field :ja_stemmed, type: 'text', analyzer: 'ja_title', search_analyzer: 'title'
field :ko_stemmed, type: 'text', analyzer: 'ko_title', search_analyzer: 'title'
field :zh_stemmed, type: 'text', analyzer: 'zh_title', search_analyzer: 'title'
end
field :acct, type: 'text', analyzer: 'content', value: ->(account) { [account.username, account.domain].compact.join('@') } do
field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content'
field :acct, type: 'text', analyzer: 'title', value: ->(account) { [account.username, account.domain].compact.join('@') } do
field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'title'
end
field :actor_type, type: 'keyword', normalizer: 'keyword'
field :text, type: 'text', value: ->(account) { account.index_text } do
field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content'
field :stemmed, type: 'text', analyzer: 'sudachi_content'
field :en_stemmed, type: 'text', analyzer: 'content'
field :ja_stemmed, type: 'text', analyzer: 'ja_content'
field :ko_stemmed, type: 'text', analyzer: 'ko_content'
field :zh_stemmed, type: 'text', analyzer: 'zh_content'
end
field :discoverable, type: 'boolean'

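The per-language analyzers defined above can be spot-checked once the index has been created, using the Elasticsearch analyze API through Chewy's client. A minimal sketch from a Rails console; the choice of ja_content and the sample string are illustrative assumptions, not part of this commit:

# Ask Elasticsearch which tokens the ja_content analyzer produces for a sample string.
response = Chewy.client.indices.analyze(
  index: AccountsIndex.index_name,
  body: { analyzer: 'ja_content', text: '分散型ソーシャルネットワーク' }
)
puts response['tokens'].map { |t| t['token'] }
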
app/chewy/statuses_index.rb

@@ -1,38 +1,99 @@
# frozen_string_literal: true
class StatusesIndex < Chewy::Index
settings index: {
refresh_interval: '15m',
number_of_shards: 1,
number_of_replicas: 0,
},
analysis: {
tokenizer: {
sudachi_tokenizer: {
type: 'sudachi_tokenizer',
discard_punctuation: true,
resources_path: '/etc/elasticsearch/sudachi',
settings_path: '/etc/elasticsearch/sudachi/sudachi.json',
},
},
analyzer: {
content: {
filter: %w(
lowercase
cjk_width
sudachi_part_of_speech
sudachi_ja_stop
sudachi_baseform
search
),
tokenizer: 'sudachi_tokenizer',
type: 'custom',
},
},
settings index: { refresh_interval: '15m' }, analysis: {
filter: {
search: {
type: 'sudachi_split',
mode: 'search',
english_stop: {
type: 'stop',
stopwords: '_english_',
},
english_stemmer: {
type: 'stemmer',
language: 'english',
},
english_possessive_stemmer: {
type: 'stemmer',
language: 'possessive_english',
},
},
char_filter: {
tsconvert: {
type: 'stconvert',
keep_both: false,
delimiter: '#',
convert_type: 't2s',
},
},
tokenizer: {
kuromoji_user_dict: {
type: 'kuromoji_tokenizer',
user_dictionary: 'userdic.txt',
},
nori_user_dict: {
type: 'nori_tokenizer',
decompound_mode: 'mixed',
},
},
analyzer: {
en_content: {
tokenizer: 'uax_url_email',
filter: %w(
english_possessive_stemmer
lowercase
asciifolding
cjk_width
english_stop
english_stemmer
),
},
ja_content: {
type: 'custom',
char_filter: %w(
icu_normalizer
kuromoji_iteration_mark
),
tokenizer: 'kuromoji_user_dict',
filter: %w(
kuromoji_baseform
kuromoji_part_of_speech
ja_stop
kuromoji_stemmer
kuromoji_number
cjk_width
lowercase
),
},
ko_content: {
tokenizer: 'nori_user_dict',
filter: %w(
english_possessive_stemmer
lowercase
asciifolding
cjk_width
english_stop
english_stemmer
),
},
zh_content: {
tokenizer: 'ik_max_word',
filter: %w(
english_possessive_stemmer
lowercase
asciifolding
cjk_width
english_stop
english_stemmer
),
char_filter: %w(tsconvert),
},
},
}
@@ -59,6 +120,11 @@ class StatusesIndex < Chewy::Index
data.each.with_object({}) { |(id, name), result| (result[id] ||= []).push(name) }
end
crutch :votes do |collection|
data = ::PollVote.joins(:poll).where(poll: { status_id: collection.map(&:id) }).where(account: Account.local).pluck(:status_id, :account_id)
data.each.with_object({}) { |(id, name), result| (result[id] ||= []).push(name) }
end
crutch :emoji_reactions do |collection|
data = ::EmojiReaction.where(status_id: collection.map(&:id)).where(account: Account.local).pluck(:status_id, :account_id)
data.each.with_object({}) { |(id, name), result| (result[id] ||= []).push(name) }
@@ -72,11 +138,28 @@ class StatusesIndex < Chewy::Index
root date_detection: false do
field :id, type: 'long'
field :account_id, type: 'long'
field :domain, type: 'keyword', value: ->(status) { status.account_domain }
field :text, type: 'text', value: ->(status) { status.index_text } do
field :stemmed, type: 'text', analyzer: 'content'
field :en_stemmed, type: 'text', analyzer: 'en_content'
field :ja_stemmed, type: 'text', analyzer: 'ja_content'
field :ko_stemmed, type: 'text', analyzer: 'ko_content'
field :zh_stemmed, type: 'text', analyzer: 'zh_content'
end
field :mentioned_account_id, type: 'long'
field :tag_id, type: 'long'
field :media_type, type: 'keyword'
field :reference_type, type: 'keyword'
field :language, type: 'keyword'
field :replies_count, type: 'long'
field :reblogs_count, type: 'long'
field :favourites_count, type: 'long'
field :emoji_reactions_count, type: 'long'
field :status_referred_by_count, type: 'long'
field :visibility, type: 'keyword'
field :searchable_by, type: 'long', value: ->(status, crutches) { status.searchable_by(crutches) }
field :searchability, type: 'keyword', value: ->(status) { status.compute_searchability }
end

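The new stemmed subfields and keyword fields can be exercised directly through Chewy scopes; the sketch below is illustrative only (query text and filter values are assumptions), combining a relevance match over text and text.ja_stemmed with filters on the new language and visibility keywords:

# Illustrative Chewy query against the fields added above; not part of this commit.
StatusesIndex
  .query(multi_match: { query: '検索', fields: %w(text text.ja_stemmed), type: 'most_fields' })
  .filter(term: { language: 'ja' })
  .filter(term: { visibility: 'public' })
  .limit(10)
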
app/chewy/tags_index.rb

@@ -1,18 +1,48 @@
# frozen_string_literal: true
class TagsIndex < Chewy::Index
settings index: {
refresh_interval: '15m',
number_of_shards: 1,
number_of_replicas: 0,
},
analysis: {
settings index: { refresh_interval: '15m' }, analysis: {
char_filter: {
tsconvert: {
type: 'stconvert',
keep_both: false,
delimiter: '#',
convert_type: 't2s',
},
},
analyzer: {
content: {
tokenizer: 'keyword',
filter: %w(lowercase asciifolding cjk_width),
},
ja_content: {
type: 'custom',
char_filter: %w(icu_normalizer kuromoji_iteration_mark),
tokenizer: 'kuromoji_user_dict',
filter: %w(
kuromoji_baseform
kuromoji_part_of_speech
ja_stop
kuromoji_stemmer
kuromoji_number
cjk_width
lowercase
),
},
ko_content: {
tokenizer: 'nori_user_dict',
filter: %w(lowercase asciifolding cjk_width),
},
zh_content: {
tokenizer: 'ik_max_word',
filter: %w(lowercase asciifolding cjk_width),
char_filter: %w(tsconvert),
},
edge_ngram: {
tokenizer: 'edge_ngram',
filter: %w(lowercase asciifolding cjk_width),
@@ -25,6 +55,16 @@ class TagsIndex < Chewy::Index
min_gram: 2,
max_gram: 15,
},
kuromoji_user_dict: {
type: 'kuromoji_tokenizer',
user_dictionary: 'userdic.txt',
},
nori_user_dict: {
type: 'nori_tokenizer',
decompound_mode: 'mixed',
},
},
}
@@ -33,6 +73,9 @@ class TagsIndex < Chewy::Index
root date_detection: false do
field :name, type: 'text', analyzer: 'content' do
field :edge_ngram, type: 'text', analyzer: 'edge_ngram', search_analyzer: 'content'
field :ja_stemmed, type: 'text', analyzer: 'ja_content', search_analyzer: 'content'
field :ko_stemmed, type: 'text', analyzer: 'ko_content', search_analyzer: 'content'
field :zh_stemmed, type: 'text', analyzer: 'zh_content', search_analyzer: 'content'
end
field :reviewed, type: 'boolean', value: ->(tag) { tag.reviewed? }

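Since zh_content runs the tsconvert char filter (stconvert, convert_type t2s) before ik_max_word, Traditional and Simplified spellings of the same tag should collapse to the same tokens. A hedged spot check against a built index; the sample strings are arbitrary:

# Both spellings should produce identical token lists if the t2s conversion applies.
%w(臺灣 台湾).each do |sample|
  result = Chewy.client.indices.analyze(
    index: TagsIndex.index_name,
    body: { analyzer: 'zh_content', text: sample }
  )
  puts result['tokens'].map { |t| t['token'] }.inspect
end
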
app/controllers/api/v1/accounts/search_controller.rb

@@ -20,7 +20,8 @@ class Api::V1::Accounts::SearchController < Api::BaseController
followers: truthy_param?(:followers),
following: truthy_param?(:following),
group_only: truthy_param?(:group_only),
offset: params[:offset]
offset: params[:offset],
language: current_user.setting_default_language
)
end
end

app/controllers/api/v2/search_controller.rb

@@ -26,7 +26,7 @@ class Api::V2::SearchController < Api::BaseController
params[:q],
current_account,
limit_param(RESULTS_LIMIT),
search_params.merge(resolve: truthy_param?(:resolve), exclude_unreviewed: truthy_param?(:exclude_unreviewed))
search_params.merge(resolve: truthy_param?(:resolve), exclude_unreviewed: truthy_param?(:exclude_unreviewed), language: current_user.setting_default_language)
)
end

app/lib/account_search_query_transformer.rb

@@ -4,7 +4,8 @@ class AccountSearchQueryTransformer < Parslet::Transform
class Query
attr_reader :should_clauses, :must_not_clauses, :must_clauses
def initialize(clauses)
def initialize(clauses, language)
@fields = ['text'].push(%w(ja ko zh).include?(language) ? "text.#{language}_stemmed" : 'text.en_stemmed')
grouped = clauses.chunk(&:operator).to_h
@should_clauses = grouped.fetch(:should, [])
@must_not_clauses = grouped.fetch(:must_not, [])
@@ -23,7 +24,7 @@ class AccountSearchQueryTransformer < Parslet::Transform
def clause_to_query(clause)
case clause
when TermClause
{ multi_match: { type: 'most_fields', query: clause.term, fields: ['text.edge_ngram', 'text.stemmed'] } }
{ multi_match: { type: 'most_fields', query: clause.term, fields: @fields } }
when PhraseClause
{ match_phrase: { text: { query: clause.phrase } } }
else
@@ -102,5 +103,5 @@ class AccountSearchQueryTransformer < Parslet::Transform
end
end
rule(query: sequence(:clauses)) { Query.new(clauses) }
rule(query: sequence(:clauses)) { Query.new(clauses, 'ja') }
end

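The added language argument only decides which stemmed subfield is searched next to the base text field; a worked example of the selection expression used in the initializer above, with the comments showing what it evaluates to:

# Mirrors the @fields construction above; illustrative values only.
language = 'ja'
['text'].push(%w(ja ko zh).include?(language) ? "text.#{language}_stemmed" : 'text.en_stemmed')
# => ["text", "text.ja_stemmed"]
# Any other language (for example 'en' or 'fr') falls back to ["text", "text.en_stemmed"].
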
app/lib/search_query_transformer.rb

@@ -4,7 +4,8 @@ class SearchQueryTransformer < Parslet::Transform
class Query
attr_reader :should_clauses, :must_not_clauses, :must_clauses, :filter_clauses, :order_clauses
def initialize(clauses)
def initialize(clauses, language)
@fields = ['text'].push(%w(ja ko zh).include?(language) ? "text.#{language}_stemmed" : 'text.en_stemmed')
grouped = clauses.chunk(&:operator).to_h
@should_clauses = grouped.fetch(:should, [])
@must_not_clauses = grouped.fetch(:must_not, [])
@@ -27,7 +28,7 @@ class SearchQueryTransformer < Parslet::Transform
def clause_to_query(clause)
case clause
when TermClause
{ multi_match: { type: 'most_fields', query: clause.term, fields: ['text', 'text.stemmed'] } }
{ multi_match: { type: 'most_fields', query: clause.term, fields: @fields } }
when PhraseClause
{ match_phrase: { text: { query: clause.phrase } } }
else
@@ -94,7 +95,7 @@ class SearchQueryTransformer < Parslet::Transform
class PrefixClause
attr_reader :filter, :operator, :term
def initialize(prefix, term)
def initialize(prefix, operator, term)
case prefix
when 'from'
@operator = :filter
@@ -106,7 +107,7 @@ class SearchQueryTransformer < Parslet::Transform
@term = account.id
when 'order'
raise "Unknown order: #{term}" unless %w(asc desc).include?(term)
raise "Unknown order: #{term}" unless %w(asc desc score).include?(term)
@operator = :order
@term = term
@@ -121,7 +122,7 @@ class SearchQueryTransformer < Parslet::Transform
operator = clause[:operator]&.to_s
if clause[:prefix]
PrefixClause.new(prefix, clause[:term].to_s)
PrefixClause.new(prefix, operator, clause[:term].to_s)
elsif clause[:term]
TermClause.new(prefix, operator, clause[:term].to_s)
elsif clause[:shortcode]
@@ -133,5 +134,5 @@ class SearchQueryTransformer < Parslet::Transform
end
end
rule(query: sequence(:clauses)) { Query.new(clauses) }
rule(query: sequence(:clauses)) { Query.new(clauses, 'ja') }
end

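With score accepted alongside asc and desc, a query string such as order:score cats should now pass the order validation. A minimal round trip through the transformer; the console usage and sample input are assumptions, and SearchQueryParser is the Parslet parser that normally feeds this transform rather than part of this diff:

# Hypothetical check that the extended order: prefix is accepted.
parsed = SearchQueryParser.new.parse('order:score cats')
query  = SearchQueryTransformer.new.apply(parsed)
query.order_clauses.first.term # => "score"
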
app/models/mention.rb

@@ -14,6 +14,8 @@
class Mention < ApplicationRecord
include Paginable
update_index('statuses') { status }
belongs_to :account, inverse_of: :mentions
belongs_to :status

app/models/preview_card.rb

@@ -37,6 +37,8 @@ class PreviewCard < ApplicationRecord
self.inheritance_column = false
update_index('statuses') { statuses }
enum type: [:link, :photo, :video, :rich]
has_and_belongs_to_many :statuses

app/models/status.rb

@@ -178,6 +178,7 @@ class Status < ApplicationRecord
ids += favourites.where(account: Account.local).pluck(:account_id)
ids += reblogs.where(account: Account.local).pluck(:account_id)
ids += bookmarks.where(account: Account.local).pluck(:account_id)
ids += poll.votes.where(account: Account.local).pluck(:account_id) if poll.present?
ids += emoji_reactions.where(account: Account.local).pluck(:account_id)
ids += referred_by_statuses.where(account: Account.local).pluck(:account_id)
else
@@ -185,6 +186,7 @@ class Status < ApplicationRecord
ids += preloaded.favourites[id] || []
ids += preloaded.reblogs[id] || []
ids += preloaded.bookmarks[id] || []
ids += preloaded.votes[id] || []
ids += preloaded.emoji_reactions[id] || []
ids += preloaded.status_references[id] || []
end
@@ -331,6 +333,22 @@ class Status < ApplicationRecord
@index_text ||= [spoiler_text, Formatter.instance.plaintext(self)].concat(media_attachments.map(&:description)).concat(preloadable_poll ? preloadable_poll.options : []).concat(quote? ? ["QT: [#{quote.url || ActivityPub::TagManager.instance.url_for(quote)}]"] : []).filter(&:present?).join("\n\n")
end
def tag_id
tags.map(&:id)
end
def mentioned_account_id
mentions.map(&:account_id)
end
def media_type
media_attachments&.first&.type
end
def reference_type
preview_card&.type
end
def replies_count
status_stat&.replies_count || 0
end

app/models/status_stat.rb

@@ -17,6 +17,8 @@
#
class StatusStat < ApplicationRecord
update_index('statuses') { status }
belongs_to :status, inverse_of: :status_stat
after_commit :reset_parent_cache

app/services/account_search_service.rb

@@ -8,6 +8,8 @@ class AccountSearchService < BaseService
@query = query&.strip&.gsub(/\A@/, '')
@limit = options[:limit].to_i
@offset = options[:offset].to_i
@lang = options.delete(:language).to_s
@fields = %w(acct.edge_ngram acct display_name).push(%w(ja ko zh).include?(@lang) ? "display_name.#{@lang}_stemmed" : 'display_name.edge_ngram')
@options = options
@account = account
@@ -85,7 +87,7 @@ class AccountSearchService < BaseService
end
def from_elasticsearch(count = false)
must_clauses = [{ multi_match: { query: terms_for_query, fields: likely_acct? ? %w(acct.edge_ngram acct) : %w(acct.edge_ngram acct display_name.edge_ngram display_name), type: 'most_fields', operator: 'and' } }]
must_clauses = [{ multi_match: { query: terms_for_query, fields: likely_acct? ? %w(acct.edge_ngram acct) : @fields, type: 'most_fields', operator: 'and' } }]
should_clauses = []
if account

app/services/search_service.rb

@@ -46,7 +46,8 @@ class SearchService < BaseService
@account,
limit: @limit,
resolve: @resolve,
offset: @offset
offset: @offset,
language: @options[:language]
)
end
@@ -54,6 +55,7 @@ class SearchService < BaseService
AccountSearchService.new.count(
@query,
@account,
language: @options[:language]
)
end
@@ -63,7 +65,8 @@ class SearchService < BaseService
@account,
limit: @limit,
resolve: @resolve,
offset: @offset
offset: @offset,
language: @options[:language]
)
end
@@ -109,7 +112,8 @@ class SearchService < BaseService
@query,
limit: @limit,
offset: @offset,
exclude_unreviewed: @options[:exclude_unreviewed]
exclude_unreviewed: @options[:exclude_unreviewed],
language: @options[:language]
)
end
@@ -146,7 +150,7 @@ class SearchService < BaseService
end
def account_searchable?
account_search? && (account_search_explicit_pattern? || @query.match?(/\A#{Account::USERNAME_RE}\Z/))
account_search?
end
def hashtag_searchable?

app/services/tag_search_service.rb

@@ -5,6 +5,8 @@ class TagSearchService < BaseService
@query = query.strip.gsub(/\A#/, '')
@offset = options.delete(:offset).to_i
@limit = options.delete(:limit).to_i
@lang = options.delete(:language).to_s
@fields = ['name'].push(%w(ja ko zh).include?(@lang) ? "name.#{@lang}_stemmed" : 'name.edge_ngram')
@options = options
results = from_elasticsearch if Chewy.enabled?
@@ -21,7 +23,7 @@ class TagSearchService < BaseService
query: {
multi_match: {
query: @query,
fields: %w(name.edge_ngram name),
fields: @fields,
type: 'most_fields',
operator: 'and',
},