Module: Myaso::Mystem

Extended by:
Mystem
Included in:
Mystem
Defined in:
lib/myaso/mystem.rb

Overview

Mystem is a popular morphological analyzer for Russian, written at Yandex by Ilya Segalovich and Vitaly Titov. The analyzer can efficiently deal with non-dictionary words and produce hypotheses for them. It is available at <tech.yandex.ru/mystem/>.

Defined Under Namespace

Modules: Library Classes: Form, Lemma

Instance Method Summary collapse

Instance Method Details

#analyze(word) ⇒ Object

Analyzes a word and returns an array of lemmas, each of which represents a particular ambiguous morphological interpretation.

:call-seq:

analyze(String)

74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# File 'lib/myaso/mystem.rb', line 74

# Analyzes +word+ and builds one Lemma for every entry that invoke_analyze
# yields.
#
# word - the String to analyze.
#
# Returns an Array of Lemma objects in the order they were yielded.
def analyze(word)
  results = []

  invoke_analyze(as_symbols(word), word.length) do |entry|
    # Raw text buffers and their lengths, read before any conversion.
    lemma_ptr = MystemLemmaText(entry)
    lemma_size = MystemLemmaTextLen(entry)

    form_ptr = MystemLemmaForm(entry)
    form_size = MystemLemmaFormLen(entry)

    stem_grammemes = MystemLemmaStemGram(entry).bytes
    flex_grammemes = as_strings(
      MystemLemmaFlexGram(entry),
      MystemLemmaFlexGramNum(entry)
    )
    # The MSD is computed from the union of stem and flex grammemes.
    combined = stem_grammemes | flex_grammemes

    lemma_string = as_string(lemma_ptr, lemma_size)
    form_string = as_string(form_ptr, form_size)
    quality = QUALITY[MystemLemmaQuality(entry)]
    msd = Myasorubka::Mystem::Binary.to_msd(combined)
    flex_length = MystemLemmaFlexLen(entry)
    rule_id = MystemLemmaRuleId(entry)

    results << Lemma.new(
      lemma_string, form_string, quality, msd,
      stem_grammemes, flex_grammemes, flex_length, rule_id
    )
  end

  results
end

#forms(word, rule_id) ⇒ Object

Analyzes a word and returns an array of its forms according to the given rule_id.

:call-seq:

forms(String, Fixnum)

108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# File 'lib/myaso/mystem.rb', line 108

# Analyzes +word+ and, for every yielded lemma whose rule id equals
# +rule_id+, generates its forms and wraps each one in a Form.
#
# word    - the String to analyze.
# rule_id - the rule id a lemma must have for its forms to be generated.
#
# Returns an Array of Form objects in the order they were yielded.
def forms(word, rule_id)
  generated = []

  invoke_analyze(as_symbols(word), word.length) do |lemma|
    # Skip interpretations produced by a different rule.
    next unless rule_id == MystemLemmaRuleId(lemma)

    invoke_generate(lemma) do |entry|
      text_ptr = MystemFormText(entry)
      text_size = MystemFormTextLen(entry)

      stem_grammemes = MystemFormStemGram(entry).bytes
      flex_grammemes = as_strings(
        MystemFormFlexGram(entry),
        MystemFormFlexGramNum(entry)
      )
      # The MSD is computed from the union of stem and flex grammemes.
      combined = stem_grammemes | flex_grammemes

      form_string = as_string(text_ptr, text_size)
      msd = Myasorubka::Mystem::Binary.to_msd(combined)

      generated << Form.new(form_string, msd, stem_grammemes, flex_grammemes)
    end
  end

  generated
end

#inflect(forms, grammemes) ⇒ Object

Finds exact matches of grammemes for the provided forms of a word. It is necessary to be careful because computational linguistics is a hard field.

:call-seq:

inflect([Form], Hash)

140
141
142
143
144
# File 'lib/myaso/mystem.rb', line 140

# Selects the forms whose MSD grammemes match every requested key/value pair.
#
# forms     - an Enumerable of objects responding to +msd+, whose result
#             responds to +grammemes+ and supports lookup by key via +[]+.
# grammemes - a Hash (or other enumerable of key/value pairs) of required
#             grammeme values.
#
# Returns an Array of the matching forms, in their original order. When
# +grammemes+ is empty every form matches.
def inflect(forms, grammemes)
  forms.select do |form|
    # all? stops at the first mismatching pair instead of folding over the
    # whole collection.
    grammemes.all? { |key, value| form.msd.grammemes[key] == value }
  end
end