Class: Browser

Inherits:
Object show all
Defined in:
lib/epitools/browser.rb,
lib/epitools/browser/cache.rb

Overview

A mechanize class that emulates a web-browser, with cache and everything. Progress bars are enabled by default.

Defined Under Namespace

Classes: Cache

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ Browser

Default options:

:delay => 1,                      # Sleep 1 second between gets
:delay_jitter => 0.2,             # Random deviation from delay
:use_cache => false,              # Cache gets only when enabled (also accepted as :cache or :cached)
:use_logs => false,               # Don't log the detailed transfer info
:cookie_file => "cookies.txt"     # Save cookies to file

39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
# File 'lib/epitools/browser.rb', line 39

# Build a Browser: record the settings, optionally point TCPSocket at a
# proxy, then create the agent and (when caching is enabled) the cache.
#
# Options (the documented name is checked first, the older spelling second):
#   :delay                          - seconds to sleep between gets (default 1)
#   :delay_jitter                   - random deviation from the delay (default 0.2)
#   :use_cache / :cache / :cached   - enable the page cache; caching stays off
#                                     unless one of these is truthy
#   :use_logs / :logs               - log detailed transfer info (default false)
#   :cookie_file / :cookiefile      - cookie file path (default "cookies.txt")
#   :cache_file                     - cache database path (default "browser-cache.db")
#   :proxy                          - "host:port" string, split and assigned to
#                                     TCPSocket::socks_server / socks_port
def initialize(options={})
  @last_get     = Time.at(0)
  @delay        = options[:delay]          || 1
  @delay_jitter = options[:delay_jitter]   || 0.2
  @use_cache    = !!(options[:cache] || options[:cached] || options[:use_cache])
  # The documented keys (:use_logs, :cookie_file) used to be silently ignored
  # because only the short spellings were read; accept both.
  @use_logs     = options[:use_logs]       || options[:logs]       || false
  @cookie_file  = options[:cookie_file]    || options[:cookiefile] || "cookies.txt"
  @cache_file   = options[:cache_file]     || "browser-cache.db"

  # TODO: @progress, @user_agent, @logfile, @cache_file (default location: ~/.epitools?)

  if options[:proxy]
    host, port = options[:proxy].split(':')
    TCPSocket::socks_server = host
    TCPSocket::socks_port   = port.to_i
  end

  init_agent!
  init_cache!
end

Instance Attribute Details

#agent ⇒ Object

Returns the value of attribute agent


29
30
31
# File 'lib/epitools/browser.rb', line 29

# Reader for @agent, the object assigned by init_agent!.
def agent; @agent; end

#cache ⇒ Object

Returns the value of attribute cache


29
30
31
# File 'lib/epitools/browser.rb', line 29

# Reader for @cache; nil until a cache object has been assigned.
def cache; @cache; end

#delay(override_delay = nil, override_jitter = nil) ⇒ Object

Returns the value of attribute delay


29
30
31
# File 'lib/epitools/browser.rb', line 29

# Plain reader for @delay.
# NOTE(review): the heading for this entry lists the signature
# delay(override_delay = nil, override_jitter = nil), and #get calls
# delay(5). This zero-argument reader is presumably superseded by a fuller
# method defined elsewhere in browser.rb -- confirm before relying on it.
def delay
  @delay
end

#delay_jitter ⇒ Object

Returns the value of attribute delay_jitter


29
30
31
# File 'lib/epitools/browser.rb', line 29

# Reader for @delay_jitter.
def delay_jitter; @delay_jitter; end

#use_cache ⇒ Object

Returns the value of attribute use_cache


29
30
31
# File 'lib/epitools/browser.rb', line 29

# Reader for @use_cache, the browser-wide caching flag.
def use_cache; @use_cache; end

Instance Method Details

#cache_put(page, url) ⇒ Object


112
113
114
115
116
117
118
119
# File 'lib/epitools/browser.rb', line 112

# Write a fetched page to the cache when it qualifies.
#
# A page is stored only if the cache reports it as valid and its content
# type starts with "text/" or mentions "javascript". Existing entries are
# overwritten.
#
# Returns the result of cache.put, or nil when nothing was written.
def cache_put(page, url)
  return unless cache.valid_page?(page)
  return unless page.content_type =~ %r{(^text/|^application/javascript|javascript)}

  puts "  |_ writing to cache"
  cache.put(page, url, :overwrite=>true)
end

#cacheable?(page) ⇒ Boolean


105
106
107
108
109
110
# File 'lib/epitools/browser.rb', line 105

# Whether a page's content type marks it as cacheable.
#
# Returns true when the content type starts with "text" or "application",
# and false otherwise (including when the content type is nil). The
# non-matching case used to fall through and return nil.
def cacheable?(page)
  case page.content_type
  when %r{^(text|application)} then true
  else false
  end
end

#get(url, options = {}) ⇒ Object

Retrieve a URL and return a Mechanize::Page instance (which acts a bit like a Nokogiri::HTML::Document instance).

Options:

:cached => true/false   | check cache before getting page

129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# File 'lib/epitools/browser.rb', line 129

# Retrieve a URL and return the resulting page.
#
# When caching is in effect and the cache holds the URL, the cached page is
# returned. Otherwise `delay` is called, the page is fetched with the agent,
# and (when caching is in effect) it is handed to #cache_put.
#
# Options:
#   :cached => true/false   | check cache before getting page; when omitted,
#                           | the browser-wide @use_cache setting applies
#
# Caching is only in effect when a cache object exists. Asking for
# :cached => true on a browser that has no cache falls back to a plain fetch
# instead of failing on a nil cache.
#
# The network errors listed in the rescue clause are retried after
# delay(5). At most max_retries attempts are made in total; after that a
# "giving up" line is printed and nil is returned. An error whose message is
# "getaddrinfo: Name or service not known" is re-raised immediately.
def get(url, options={})

  # TODO: Have a base-URL option

  #if relative?(url)
  #  url = URI.join("http://base-url/", url).to_s
  #end

  # Determine the cache setting (a per-call option wins over the default)
  use_cache = options[:cached].nil? ? @use_cache : options[:cached]
  # No cache object available: fetch directly rather than calling
  # methods on nil.
  use_cache = false if cache.nil?

  cached_already = use_cache && cache.include?(url)

  puts
  puts "[ GET #{url} (using cache: #{!!use_cache}) ]"

  delay unless cached_already
  max_retries = 4
  retries = 0

  begin

    # Reset on every attempt so a retry never sees a stale page.
    page = use_cache ? cache.get(url) : nil

    if page
      puts "  |_ cached (#{page.content_type})"
    else
      page = agent.get(url)
      @last_get = Time.now
      cache_put(page, url) if use_cache
    end

    puts

  rescue Net::HTTPBadResponse, Errno::ECONNRESET, SocketError, Timeout::Error, SOCKSError => e
    raise if e.message == "getaddrinfo: Name or service not known"

    retries += 1
    if retries >= max_retries
      puts "  |_ ERROR: #{e.inspect} -- giving up"
      return
    end

    puts "  |_ ERROR: #{e.inspect} -- retrying"
    delay(5)
    retry

  # TODO (disabled draft): also rescue Mechanize::ResponseCodeError and
  # branch on e.response_code:
  #   "401" (Net::HTTPUnauthorized) -> p e; login!; page = get(url); puts
  #   "404"                         -> p e; raise e
  #   "503"                         -> print the retry message, delay(5), retry
  #   anything else                 -> raise e

  end

  page
end

#init_agent! ⇒ Object


60
61
62
63
64
65
66
67
68
69
# File 'lib/epitools/browser.rb', line 60

# Create the Mechanize agent and keep it in @agent, then load any saved
# cookies. History is capped at 10 entries, the user agent alias is set to
# "Windows Chrome", and a logger writing to "mechanize.log" is attached when
# @use_logs is set.
#
# Returns whatever load_cookies! returns.
def init_agent!
  @agent = Mechanize.new do |mech|
    # ["Mechanize", "Mac Mozilla", "Linux Mozilla", "Windows IE 6", "iPhone", "Linux Konqueror", "Windows IE 7", "Mac FireFox", "Mac Safari", "Windows Mozilla"]
    mech.max_history = 10
    mech.user_agent_alias = "Windows Chrome"
    if @use_logs
      mech.log = Logger.new("mechanize.log")
    end
  end

  load_cookies!
end

#init_cache! ⇒ Object


82
83
84
85
# File 'lib/epitools/browser.rb', line 82

# Create the page cache from @cache_file and the agent, but only when
# caching is enabled. With caching off, @cache is left untouched and nil is
# returned.
def init_cache!
  # TODO: Rescue "couldn't load" exception and disable caching
  return unless @use_cache

  @cache = Cache.new(@cache_file, agent)
end

#load_cookies! ⇒ Object


87
88
89
90
91
92
93
94
# File 'lib/epitools/browser.rb', line 87

# Load cookies from @cookie_file into the agent's cookie jar.
#
# Returns true when the file exists and was handed to the jar, false when
# there is no such file.
def load_cookies!
  # File.exists? is deprecated and was removed in Ruby 3.2; File.exist? is
  # the supported spelling.
  return false unless File.exist?(@cookie_file)

  agent.cookie_jar.load(@cookie_file)
  true
end

#relative?(url) ⇒ Boolean


101
102
103
# File 'lib/epitools/browser.rb', line 101

# True when the URL does not begin with "http://" or "https://".
def relative?(url)
  scheme_prefix = url[%r{^https?://}]
  scheme_prefix ? false : true
end

#save_cookies! ⇒ Object


96
97
98
99
# File 'lib/epitools/browser.rb', line 96

# Write the agent's cookie jar to @cookie_file. Always returns true.
def save_cookies!
  jar = agent.cookie_jar
  jar.save_as(@cookie_file)
  true
end