Class: Chinese::Scraper

Inherits:
Object
  • Object
show all
Includes:
HelperMethods, WithValidations
Defined in:
lib/chinese/scraper.rb

Constant Summary

# Scraper settings for every supported sentence source, keyed by source name.
# Each entry holds:
#   url         - search URL to which the escaped query word is appended
#   parent_sel  - CSS selector for the container(s) holding the example sentences
#   cn_sel      - XPath selector for the nodes carrying Chinese sentences
#   en_sel      - XPath selector for the nodes carrying English sentences
#   select_pair - predicate telling whether two neighbouring nodes form a sentence pair
#   text_sel    - selector applied to a single node to pull out its sentence text
#   reorder     - maps two texts to the order in which they should be returned
Sources = {
  nciku: {
    url:         "http://www.nciku.com/search/all/examples/",
    parent_sel:  "div.examples_box > dl",
    cn_sel:      "//dt/span[1]",
    en_sel:      "//dd/span[@class='tc_sub']",
    # Two neighbouring nodes belong together only when the second one has
    # the class 'tc_sub' and the first one does not.
    select_pair: ->(first, second) { first['class'] != "tc_sub" && second['class'] == "tc_sub" },
    # Take the node's text as it is. :text_sel mainly exists for jukuu (see below).
    text_sel:    "text()",
    # The result should be cn first, en second, but nciku does not deliver
    # its sentence pairs in a strict order, so swap when the second text is
    # the one flagged by is_unicode?.
    reorder:     ->(first, second) { is_unicode?(second) ? [second, first] : [first, second] }
  },
  jukuu: {
    url:         "http://www.jukuu.com/search.php?q=",
    parent_sel:  "table#Table1 table[width = '680']",
    cn_sel:      "//tr[@class='c']",
    en_sel:      "//tr[@class='e']",
    # Two neighbouring nodes belong together only when the first one has
    # the class 'e' and the second one does not.
    select_pair: ->(first, second) { first['class'] == "e" && second['class'] != "e" },
    text_sel:    "td[2]",
    # Unconditionally swap the two texts.
    reorder:     ->(first, second) { [second, first] }
  }
}
# Supported options. Every key maps to a two-element array:
# [default value, lambda answering whether a supplied value is acceptable].
OPTIONS = {
  source: [:nciku,   ->(value) { Sources.key?(value) }],
  size:   [:average, ->(value) { [:short, :average, :long].include?(value) }]
}

Instance Attribute Summary (collapse)

Class Method Summary (collapse)

Methods included from HelperMethods

#distinct_words, #include_every_char?, #is_unicode?

Instance Attribute Details

- (Object) sentences

Returns the value of attribute sentences



16
17
18
# File 'lib/chinese/scraper.rb', line 16

# Reader for the +sentences+ attribute.
#
# @return [Object] the current value of @sentences
def sentences
  @sentences
end

- (Object) source (readonly)

Returns the value of attribute source



15
16
17
# File 'lib/chinese/scraper.rb', line 15

# Reader for the +source+ attribute.
#
# @return [Object] the current value of @source
def source
  @source
end

- (Object) word (readonly)

Returns the value of attribute word



15
16
17
# File 'lib/chinese/scraper.rb', line 15

# Reader for the +word+ attribute.
#
# @return [Object] the current value of @word
def word
  @word
end

Class Method Details

+ (Object) average_size(sentence_pairs)



133
134
135
136
137
# File 'lib/chinese/scraper.rb', line 133

# Picks the sentence pair whose Chinese sentence has the median length.
#
# @param sentence_pairs [Array<Array>] pairs of the form [cn_sentence, en_sentence]
# @return [Array, nil] the pair in the middle of the list once it is ordered by
#   the length of the Chinese sentence (the upper median for an even number of
#   pairs); nil when +sentence_pairs+ is empty
def self.average_size(sentence_pairs)
  sorted = sentence_pairs.sort_by { |(cn, _)| cn.length }
  # Select by position in the sorted list. Comparing a sentence's character
  # count with half the number of pairs mixes two unrelated quantities: it
  # tends to pick one of the shortest sentences and can even find nothing
  # in a non-empty list.
  sorted[sorted.length / 2]
end

+ (Object) longest_size(sentence_pairs)



129
130
131
# File 'lib/chinese/scraper.rb', line 129

# Picks the sentence pair with the longest Chinese sentence.
#
# @param sentence_pairs [Array<Array>] pairs of the form [cn_sentence, en_sentence]
# @return [Array, nil] the pair whose first element has the greatest length;
#   nil when +sentence_pairs+ is empty
def self.longest_size(sentence_pairs)
  # max_by finds the maximum in a single pass; sorting the whole list just to
  # take its last element is unnecessary.
  sentence_pairs.max_by { |(cn, _)| cn.length }
end

+ (Boolean) pair_with_empty_string?(pair)

Helper methods

Returns:

  • (Boolean)


118
119
120
# File 'lib/chinese/scraper.rb', line 118

# Tells whether either of the first two entries of +pair+ is empty.
#
# @param pair [Array] a two-element pair; both elements must respond to empty?
# @return [Boolean] true when the first or the second element is empty
def self.pair_with_empty_string?(pair)
  # any? stops at the first empty entry, so the second entry is only asked
  # when the first one is not empty.
  [pair[0], pair[1]].any?(&:empty?)
end

+ (Object) sentence(word, options = {})



97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
# File 'lib/chinese/scraper.rb', line 97

# Scrapes the example sentences for +word+ and returns one of them, chosen
# according to the :size option.
#
# @param word [String] the word to look up
# @param options [Hash] forwarded unchanged to .sentences; the :size entry is
#   resolved through validate
# @return [Array, nil] the selected sentence pair; [] when nothing was scraped;
#   nil when the resolved size is none of :short, :average, :long
def self.sentence(word, options={})
  # NOTE(review): validate is defined outside this view (presumably in
  # WithValidations). It is called directly in this method, with the
  # +options+ parameter in scope, exactly as before - confirm before moving it.
  size = validate { :size }

  candidates = sentences(word, options)
  return [] if candidates.empty?

  case size
  when :short   then shortest_size(candidates)
  when :average then average_size(candidates)
  when :long    then longest_size(candidates)
  end
end

+ (Object) sentences(word, options = {})

Options: size => [:short, :average, :long], default = :average



47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# File 'lib/chinese/scraper.rb', line 47

# Downloads the example-sentence page for +word+ from the selected source and
# extracts the Chinese/English sentence pairs found on it.
#
# @param word [String] the word to search for; it is CGI-escaped and appended
#   to the source's :url
# @param options [Hash] options hash; the :source entry is resolved through validate
# @return [Array<Array>] [cn_sentence, en_sentence] pairs for which
#   include_every_char?(word, cn_sentence) holds; [] when the page contains
#   no container node or no sentence nodes
# @raise [Timeout::Error] when fetching the page takes longer than 20 seconds
def self.sentences(word, options={})
  # NOTE(review): validate is defined outside this view (presumably in
  # WithValidations); its result is used as a key into Sources - confirm
  # that it always returns a known source name.
  download_source = validate { :source }

  source = Sources[download_source]

  CGI.accept_charset = 'UTF-8'
  # Note: Use + because << changes the object on its left hand side, but + doesn't:
  # http://stackoverflow.com/questions/377768/string-concatenation-and-ruby/378258#378258
  url       = source[:url] + CGI.escape(word)
  # http://ruby-doc.org/stdlib-1.9.2/libdoc/timeout/rdoc/Timeout.html#method-c-timeout
  # NOTE(review): open(url) with a URL presumably relies on open-uri being
  # loaded, and the returned handle is not explicitly closed here - confirm.
  content   = Timeout.timeout(20) { open(url) }
  main_node = Nokogiri::HTML(content).css(source[:parent_sel]) # A set of matching container nodes, possibly empty.
  return []  if main_node.to_a.empty?

  # CSS selector:   Returns the tags in the order they are specified
  # XPath selector: Return the tags in the order they appear in the document (that's what we want here).
  # Source:         http://stackoverflow.com/questions/5825136/nokogiri-and-finding-element-by-name/5845985#5845985
  target_nodes = main_node.search("#{source[:cn_sel]} | #{source[:en_sel]}")
  return [] if target_nodes.to_a.empty?

  # In order to make sure we only return text that also has a translation,
  # we need to first group each target node with Array#overlap_pairs like this:
  # Input:  [cn1, cn2, en2, cn3, en3, cn4]
  # Output: [[cn1,cn2],[cn2,en2],[en2,cn3],[cn3,en3],[en3,cn4]]
  # and then select the correct pairs: [[cn2,en2],[cn3,en3]].
  # Regarding #to_a: Nokogiri::XML::NodeSet => Array
  # NOTE(review): Array#overlap_pairs is not a core Ruby method; it must be
  # provided by an extension defined outside this view.
  sentence_pairs = target_nodes.to_a.overlap_pairs.select {|(node1,node2)| source[:select_pair].call(node1,node2) }
  # Replace every node pair by a pair of stripped strings and drop the pairs
  # in which either string is empty.
  # At this point the names cn_node/en_node only denote the first/second node
  # of a pair; the actual language order is settled by :reorder further down.
  sentence_pairs = sentence_pairs.reduce([]) do |acc,(cn_node,en_node)|
    cn   = cn_node.css(source[:text_sel]).text.strip  # 'text' returns an empty string when 'css' returns an empty array.
    en   = en_node.css(source[:text_sel]).text.strip
    pair = [cn,en]
    # Ensure that both the chinese and english selector have text.
    # (sometimes they don't).
    acc << pair unless pair_with_empty_string?(pair)
    acc
  end
  # Switch position of each pair if the first entry is the translation,
  # as we always return an array of [cn_sentence,en_sentence] pairs.
  # The following step is necessary because:
  # 1) Jukuu returns sentences in the order English first, Chinese second
  # 2) Nciku mostly returns sentences in the order Chinese first, English second
  #    (but sometimes it is the other way round.)
  # Despite their names, node1/node2 are the stripped strings from the step above.
  sentence_pairs = sentence_pairs.map {|node1,node2| source[:reorder].call(node1,node2) }
  # Only select Chinese sentences that don't separate words, e.g., skip all sentences like the following:
  # 北边 => 树林边的河流向北方
  sentence_pairs = sentence_pairs.select { |cn, _| include_every_char?(word, cn) }

  sentence_pairs
end

+ (Object) shortest_size(sentence_pairs)

Despite its name, this method returns the SECOND-shortest sentence, because the shortest result is often a definition rather than a real sentence.



125
126
127
# File 'lib/chinese/scraper.rb', line 125

# Picks the second-shortest sentence pair, measured by the length of the
# Chinese sentence. When only one pair is given, that pair is returned.
#
# @param sentence_pairs [Array<Array>] pairs of the form [cn_sentence, en_sentence]
# @return [Array, nil] the second entry of the list ordered by Chinese sentence
#   length, the only entry of a one-element list, or nil for an empty list
def self.shortest_size(sentence_pairs)
  by_length = sentence_pairs.sort_by { |(chinese, _)| chinese.length }
  by_length.first(2).last
end