Class: FeedMe::ParserBuilder

Inherits:
Object
  • Object
show all
Defined in:
lib/feedme.rb

Overview

This class is used to create promiscuous parsers.

Direct Known Subclasses

StrictParserBuilder

Instance Attribute Summary (collapse)

Instance Method Summary (collapse)

Constructor Details

- (ParserBuilder) initialize(options = {})

Create a new ParserBuilder. Allowed options are:

  • :empty_string_for_nil => false # return the empty string instead of a nil value

  • :error_on_missing_key => false # raise an error if a specified key or virtual method does not exist (otherwise nil is returned)



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# File 'lib/feedme.rb', line 76

# Create a new ParserBuilder and fill every configuration table with its
# default value.
#
# options - Hash stored unchanged in @options (and later passed to
#           Parser.new by #parse). The documented keys are
#           :empty_string_for_nil and :error_on_missing_key, both of which
#           default to false.
def initialize(options={})
  @options = options

  # rss tags
  @rss_tags = [
    {
      :image     => nil,
      :textinput => nil,
      :skiphours => nil,
      :skipdays  => nil,
      :items     => [{ :rdf_seq => nil }],
     #:item      => @rss_item_tags   (added on demand by all_rss_tags)
    }
  ]
  @rss_item_tags = [ {} ]

  # atom tags
  @atom_tags = [
    {
      :author       => nil,
      :contributor  => nil,
     #:entry        => @atom_entry_tags   (added on demand by all_atom_tags)
    }
  ]
  @atom_entry_tags = [
    {
      :author       => nil,
      :contributor  => nil
    }
  ]

  # tags whose value is a date
  @date_tags = [ :pubdate, :lastbuilddate, :published, :updated, :dc_date,
    :expirationdate ]

  # tags that can be used as the default value for a mixed element
  @value_tags = {
    :media_content => :url
  }
  @default_value_tags = [ CONTENT_KEY, :href, :url ]

  # methods for selecting the element to return when the singular accessor
  # is called on a tag with multiple values
  @value_selectors = {
    # Order links by the position of their rel in DEFAULT_RELS; links whose
    # rel is not listed there sort after all listed ones.
    :link => proc do |links|
      links = links.sort do |a,b|
        i1 = DEFAULT_RELS.index(a.rel)
        i2 = DEFAULT_RELS.index(b.rel)
        i1.nil? ? (i2.nil? ? 0 : 1) : (i2.nil? ? -1 : i1 <=> i2)
      end
      links.first
    end
  }
  # Fallback selector: String values sort ahead of non-String values, then
  # the first element is returned.
  @default_value_selector = proc do |x|
    x = x.sort do |a,b|
      a.is_a?(String) ? -1 : (b.is_a?(String) ? 1 : 0)
    end
    x.first
  end

  # tag/attribute aliases
  # NOTE(review): :item_array and :entry_array point at each other --
  # presumably so either name resolves to whichever one the feed actually
  # has; confirm against the alias lookup code.
  @aliases = {
    :items        => :item_array,
    :item_array   => :entry_array,
    :entries      => :entry_array,
    :entry_array  => :item_array,
    # the "+" is not valid in a bare symbol literal, so this one is quoted
    :link         => :'link+self'
  }

  # transformations
  @html_helper_lib = HPRICOT_HELPER
  @default_transformation = [ :cleanHtml ]
  @transformations = {}
  # The HTML-based procs below require @html_helper_lib when they are
  # called, not when the builder is created, so the helper library is only
  # loaded if such a transformation is actually used.
  @transformation_fns = {
    # remove all HTML tags
    :stripHtml => proc do |str|
      require @html_helper_lib
      FeedMe.html_helper.strip_html(str)
    end,

    # clean HTML content using FeedNormalizer's HtmlCleaner class
    :cleanHtml => proc do |str|
      require @html_helper_lib
      FeedMe.html_helper.clean_html(str)
    end,

    # wrap text at a certain number of characters (respecting word boundaries)
    :wrap => proc do |str, col|
      str.gsub(/(.{1,#{col}})( +|$\n?)|(.{1,#{col}})/, "\\1\\3\n").strip
    end,

    # truncate text, respecting word boundaries
    :trunc => proc {|str, wordcount| str.trunc(wordcount.to_i) },

    # truncate HTML and leave enclosing HTML tags
    :truncHtml => proc do |str, wordcount|
      require @html_helper_lib
      FeedMe.html_helper.truncate_html(str, wordcount.to_i)
    end,

    # return the first capture group of regexp matched against str, or nil
    # when there is no match
    :regexp => proc do |str, regexp|
      match = Regexp.new(regexp).match(str)
      match.nil? ? nil : match[1]
    end,

    # this shouldn't be necessary since all text is automatically
    # unescaped, but some feeds double-escape HTML
    :esc => proc {|str| CGI.unescapeHTML(str) },

    # apply an arbitrary function
    :apply => proc {|str, fn, *args| fn.call(str, *args) }
  }
end

Instance Attribute Details

- (Object) aliases

A hash of attribute/tag name aliases.



58
59
60
# File 'lib/feedme.rb', line 58

# Returns @aliases, the hash of attribute/tag name aliases. The same hash
# object is returned each time (no copy is made).
def aliases
  @aliases
end

- (Object) atom_entry_tags

The subtags of entry elements that are parsed for Atom feeds.



44
45
46
# File 'lib/feedme.rb', line 44

# Returns @atom_entry_tags, the subtags of entry elements that are parsed
# for Atom feeds. The stored object itself is returned (no copy is made).
def atom_entry_tags
  @atom_entry_tags
end

- (Object) atom_tags

The tags that are parsed for Atom feeds.



42
43
44
# File 'lib/feedme.rb', line 42

# Returns @atom_tags, the tags that are parsed for Atom feeds. The stored
# object itself is returned (no copy is made).
def atom_tags
  @atom_tags
end

- (Object) date_tags

The names of tags that should be parsed as date values.



46
47
48
# File 'lib/feedme.rb', line 46

# Returns @date_tags, the names of tags that should be parsed as date
# values.
def date_tags
  @date_tags
end

- (Object) default_transformation

An array of the transformation functions applied when the ! suffix is added to the attribute/tag name.



61
62
63
# File 'lib/feedme.rb', line 61

# Returns @default_transformation, the array of transformation function
# names applied when the ! suffix is added to an attribute/tag name.
def default_transformation
  @default_transformation
end

- (Object) default_value_selector

Value selector to use if there is no value selector defined for a tag



56
57
58
# File 'lib/feedme.rb', line 56

# Returns @default_value_selector, the value selector to use when no
# selector is defined for a tag.
def default_value_selector
  @default_value_selector
end

- (Object) default_value_tags

Tags to use for element value when specific tag isn't specified



51
52
53
# File 'lib/feedme.rb', line 51

# Returns @default_value_tags, the tags to use for an element's value when
# no specific tag is specified.
def default_value_tags
  @default_value_tags
end

- (Object) html_helper_lib

The helper library used for HTML transformations.



70
71
72
# File 'lib/feedme.rb', line 70

# Returns @html_helper_lib, the helper library used for HTML
# transformations.
def html_helper_lib
  @html_helper_lib
end

- (Object) options (readonly)

The options passed to this ParserBuilder's constructor.



36
37
38
# File 'lib/feedme.rb', line 36

# Returns @options, the options passed to this ParserBuilder's constructor.
def options
  @options
end

- (Object) rss_item_tags

The subtags of item elements that are parsed for RSS feeds.



40
41
42
# File 'lib/feedme.rb', line 40

# Returns @rss_item_tags, the subtags of item elements that are parsed for
# RSS feeds. The stored object itself is returned (no copy is made).
def rss_item_tags
  @rss_item_tags
end

- (Object) rss_tags

The tags that are parsed for RSS feeds.



38
39
40
# File 'lib/feedme.rb', line 38

# Returns @rss_tags, the tags that are parsed for RSS feeds. The stored
# object itself is returned (no copy is made).
def rss_tags
  @rss_tags
end

- (Object) transformation_fns

Mapping of transformation function names to Procs.



68
69
70
# File 'lib/feedme.rb', line 68

# Returns @transformation_fns, the mapping of transformation function names
# to Procs.
def transformation_fns
  @transformation_fns
end

- (Object) transformations

Mapping of transformation names to functions. Each key is a suffix that can be appended to an attribute/tag name, and the value is an array of transformation function names that are applied when that transformation is used.



66
67
68
# File 'lib/feedme.rb', line 66

# Returns @transformations, the mapping of transformation names (suffixes
# that can be appended to an attribute/tag name) to arrays of transformation
# function names.
def transformations
  @transformations
end

- (Object) value_selectors

A hash of functions for selecting the correct value to return when a tag has multiple values and the singular accessor is called.



54
55
56
# File 'lib/feedme.rb', line 54

# Returns @value_selectors, the hash of functions that pick the value to
# return when a tag has multiple values and the singular accessor is called.
def value_selectors
  @value_selectors
end

- (Object) value_tags

A hash that maps a tag name to the name of the attribute/subtag whose value can be used as the default value of that mixed element (by default, :media_content => :url).



49
50
51
# File 'lib/feedme.rb', line 49

# Returns @value_tags, the tags that can be used as the default value for a
# mixed element. The constructor initializes it as a hash
# (:media_content => :url).
def value_tags
  @value_tags
end

Instance Method Details

- (Object) all_atom_tags

Prepare tag list for an Atom feed.



198
199
200
201
202
# File 'lib/feedme.rb', line 198

# Prepare tag list for an Atom feed.
#
# Returns a copy of atom_tags whose first element has its :entry key set to
# a copy of atom_entry_tags.
#
# NOTE: Array#dup is shallow, so the first hash is shared with atom_tags;
# after this call atom_tags[0] carries the :entry key as well.
def all_atom_tags
  all_tags = atom_tags.dup
  all_tags[0][:entry] = atom_entry_tags.dup
  return all_tags
end

- (Object) all_rss_tags

Prepare tag list for an RSS feed.



191
192
193
194
195
# File 'lib/feedme.rb', line 191

# Prepare tag list for an RSS feed.
#
# Returns a copy of rss_tags whose first element has its :item key set to a
# copy of rss_item_tags.
#
# NOTE: Array#dup is shallow, so the first hash is shared with rss_tags;
# after this call rss_tags[0] carries the :item key as well.
def all_rss_tags
  rss_tags.dup.tap do |tags|
    tags.first[:item] = rss_item_tags.dup
  end
end

- (Object) emulate_atom!

Add aliases so that RSS feed elements can be accessed using the names of their Atom counterparts.



221
222
223
224
225
226
227
228
229
230
231
232
233
234
# File 'lib/feedme.rb', line 221

# Add aliases so that RSS feed elements can be accessed using the names of
# their Atom counterparts.
#
# Merges the mappings below into the hash returned by +aliases+ (in place,
# via Hash#merge!, so existing entries with the same key are overwritten)
# and returns that hash.
#
# An array value lists several target names -- presumably candidates tried
# in order; confirm against the alias lookup code.
def emulate_atom!
  aliases.merge!({
    :rights       => :copyright,
    :content      => :description,
    :contributor  => :author,
    :id           => [ :guid_value, :link ],
    :author       => [ :managingeditor, :webmaster ],
    :updated      => [ :lastbuilddate, :pubdate ],
    # tag names are written in lower case everywhere else in this builder
    # (:pubdate, :lastbuilddate, :managingeditor), so use :pubdate here too
    :published    => [ :pubdate, :lastbuilddate ],
    # targets containing "/" are not valid bare symbol literals and must be
    # quoted
    :icon         => :'image/url',
    :logo         => :'image/url',
    :summary      => :'description_trunc'
  })
end

- (Object) emulate_rss!

Add aliases so that Atom feed elements can be accessed using the names of their RSS counterparts.



206
207
208
209
210
211
212
213
214
215
216
217
# File 'lib/feedme.rb', line 206

# Add aliases so that Atom feed elements can be accessed using the names of
# their RSS counterparts.
#
# Merges the mappings below into the hash returned by +aliases+ (in place,
# via Hash#merge!, so existing entries with the same key are overwritten)
# and returns that hash.
#
# An array value lists several target names -- presumably candidates tried
# in order; confirm against the alias lookup code.
def emulate_rss!
  aliases.merge!({
    :guid           => :id,       # this alias never actually gets used; see FeedData#id
    :copyright      => :rights,
    :pubdate        => [ :published, :updated ],
    :lastbuilddate  => [ :updated, :published ],
    :description    => [ :content, :summary ],
    # targets containing "/" are not valid bare symbol literals and must be
    # quoted
    :managingeditor => [ :'author/name', :'contributor/name' ],
    :webmaster      => [ :'author/name', :'contributor/name' ],
    :image          => [ :icon, :logo ]
  })
end

- (Object) parse(source)

Parse source using a Parser created from this ParserBuilder.



237
238
239
# File 'lib/feedme.rb', line 237

# Parse source using a Parser created from this ParserBuilder.
#
# source - the feed to parse; passed through to Parser.new untouched.
#          NOTE(review): accepted types (String, IO, ...) are decided by
#          Parser, which is not visible here.
#
# Returns the result of Parser.new, called with this builder, +source+ and
# the builder's +options+.
def parse(source)
  Parser.new(self, source, options)
end