Class: Link

Inherits:
ActiveRecord::Base
  • Object
show all
Defined in:
app/models/link.rb

Constant Summary

WANTED =

link types

1
INTERNAL =
2
EXTERNAL =
3
MAILTO =
4
CATEGORY =
5
DIRECTFILE =
6
LOCAL =
7
IMAGE =
8
OK =

status codes

1
OK_REDIRECT =
2
WARNING =
3
BROKEN =
4
IGNORED =
5
MAX_WARNING_COUNT =

maximum number of times a broken link reports broken before warning goes to error

3
MAX_ERROR_COUNT =

maximum number of times we'll check a broken link before giving up

10

Class Method Summary (collapse)

Instance Method Summary (collapse)

Class Method Details

+ (Object) check_url(url, make_get_request = false)



519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
# File 'app/models/link.rb', line 519

# Issues one HTTP(S) request for +url+ and reports what happened.
#
# url              - the URL to check; it is interpolated into a String before parsing
# make_get_request - when true a GET is sent, otherwise a HEAD (the default)
#
# Returns a Hash in one of three shapes:
#   {:responded => true, :code => <response.code>, :response => <response>}
#   {:responded => false, :ignored => true}     when the scheme is neither http nor https
#   {:responded => false, :error => <message>}  when parsing or the request raised
def self.check_url(url,make_get_request=false)
  headers = {'User-Agent' => 'extension.org link verification'}
  # the URL has most likely already been validated, but parse it again for good measure
  begin
    check_uri = URI.parse("#{url}")
  rescue StandardError => exception
    # StandardError rather than Exception: Interrupt/SystemExit must still be able to
    # stop the process while links are being checked
    return {:responded => false, :error => exception.message}
  end

  if(check_uri.scheme != 'http' and check_uri.scheme != 'https')
    return {:responded => false, :ignored => true}
  end

  # check it!
  begin
    http_connection = Net::HTTP.new(check_uri.host, check_uri.port)
    if(check_uri.scheme == 'https')
      # the certificate is deliberately not verified here
      http_connection.verify_mode = OpenSSL::SSL::VERIFY_NONE
      http_connection.use_ssl = true
    end

    # rebuild "path?query", falling back to "/" when the URL has no path
    request_path = !check_uri.path.blank? ? check_uri.path : "/"
    if(!check_uri.query.blank?)
      request_path += "?" + check_uri.query
    end

    if(!make_get_request)
      response = http_connection.head(request_path,headers)
    else
      response = http_connection.get(request_path,headers)
    end
    {:responded => true, :code => response.code, :response => response}
  rescue StandardError, Timeout::Error => exception
    # Timeout::Error is listed explicitly because it is not a StandardError on every
    # Ruby version; anything else outside StandardError is allowed to propagate
    return {:responded => false, :error => exception.message}
  end
end

+ (Object) count_by_linktype



580
581
582
583
584
585
586
587
# File 'app/models/link.rb', line 580

# Counts links per link type.
#
# Returns a Hash whose keys are the descriptions produced by
# linktype_to_description and whose values are the grouped counts
# returned by Link.count(:group => :linktype).
def self.count_by_linktype
  Link.count(:group => :linktype).inject({}) do |totals, (linktype, total)|
    totals[self.linktype_to_description(linktype)] = total
    totals
  end
end

+ (Object) create_from_page(page)



191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
# File 'app/models/link.rb', line 191

# Finds or creates the INTERNAL link record that represents +page+.
#
# page - an object responding to source_url, alternate_source_url,
#        old_source_url and page_source
#
# An existing link is looked up by the fingerprint of the page's source URL, and
# also by the alternate and alias fingerprints when those could be computed.
# A match is re-pointed at +page+ and every page linking to it re-stores its
# content; otherwise a new link is built and saved.
#
# Returns the link, or nil when the page has no source_url or it cannot be parsed.
def self.create_from_page(page)
  if(page.source_url.blank?)
    return nil
  end

  # make sure the URL is valid format
  begin
    source_uri = URI.parse(page.source_url)
    source_uri_fingerprint = Digest::SHA1.hexdigest(CGI.unescape(source_uri.to_s.downcase))
  rescue
    return nil
  end

  # special case for where the alternate != source_url
  if(page.alternate_source_url != page.source_url)
    begin
      alternate_source_uri = URI.parse(page.alternate_source_url)
      alternate_source_uri_fingerprint = Digest::SHA1.hexdigest(CGI.unescape(alternate_source_uri.to_s.downcase))
    rescue
      # best effort: an unparsable alternate URL is simply left out
    end
  end

  # special case for create urls - does this have an alias_uri?
  if(page.page_source and page.page_source.name == 'create')
    if(!page.old_source_url.blank?)
      begin
        old_source_uri = URI.parse(page.old_source_url)
        old_source_uri_fingerprint = Digest::SHA1.hexdigest(CGI.unescape(old_source_uri.to_s.downcase))
      rescue
        # best effort: an unparsable old URL is simply left out
      end
    elsif(migrated_url = MigratedUrl.find_by_target_url_fingerprint(source_uri_fingerprint))
      old_source_uri = migrated_url.alias_url
      old_source_uri_fingerprint = migrated_url.alias_url_fingerprint
    end
  end

  # build the lookup with bind parameters instead of interpolating values into SQL
  conditions = ["fingerprint = ?"]
  bind_values = [source_uri_fingerprint]
  if(alternate_source_uri)
    conditions << "alternate_fingerprint = ?"
    bind_values << alternate_source_uri_fingerprint
  end
  if(old_source_uri)
    conditions << "alias_fingerprint = ?"
    bind_values << old_source_uri_fingerprint
  end

  if(this_link = self.where(conditions.join(' OR '), *bind_values).first)
    # this was a wanted link - we need to update the link now - and kick off the process of updating everything
    # that links to this page
    this_link.update_attributes(:page => page, :linktype => INTERNAL)
    this_link.linkedpages.each do |linked_page|
      linked_page.store_content # parses links and images again and saves it.
    end
  else
    this_link = self.new(:page => page, :url => source_uri.to_s, :fingerprint => source_uri_fingerprint)

    if(alternate_source_uri)
      this_link.alternate_url = alternate_source_uri.to_s
      this_link.alternate_fingerprint = alternate_source_uri_fingerprint
    end

    if(old_source_uri)
      this_link.alias_url = old_source_uri.to_s
      this_link.alias_fingerprint = old_source_uri_fingerprint
    end

    this_link.source_host = source_uri.host
    this_link.linktype = INTERNAL

    # set host and path - mainly just for aggregation purposes
    if(!source_uri.host.blank?)
      this_link.host = source_uri.host
    end
    if(!source_uri.path.blank?)
      this_link.path = CGI.unescape(source_uri.path)
    end
    this_link.save
  end

  this_link
end

+ (Object) find_or_create_by_image_reference(image_reference, source_host)

Meant to be called while parsing a piece of content, for the images that the content references.



406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
# File 'app/models/link.rb', line 406

# Looks up, or creates, the IMAGE link for an image reference found in content.
#
# image_reference - the image URL as it appears in the content
# source_host     - the host the content came from; used when the reference has no host
#
# Returns the link, or nil when the reference cannot be parsed or is a data: URI.
def self.find_or_create_by_image_reference(image_reference,source_host)
  # make sure the URL is valid format
  image_uri = begin
    URI.parse(image_reference)
  rescue
    return nil
  end

  return nil if image_uri.scheme == 'data'

  if image_uri.host.blank?
    # wiki link exception inside existing create articles that we still have
    legacy_wiki_image = (image_uri.path =~ %r{^/mediawiki/} && source_host == 'create.extension.org')
    image_uri.host = legacy_wiki_image ? 'cop.extension.org' : source_host
  end
  if image_uri.scheme.blank?
    image_uri.scheme = 'http'
  end

  # the fragment is dropped for comparison; callers that need it must keep their own copy
  image_uri.fragment = nil unless image_uri.fragment.blank?

  image_url = image_uri.to_s
  image_fingerprint = Digest::SHA1.hexdigest(CGI.unescape(image_url))

  known_link = self.find_by_fingerprint(image_fingerprint)
  if known_link
    known_link.update_attribute(:linktype, IMAGE) if known_link.linktype != IMAGE
    return known_link
  end

  new_link = self.new(:url => image_url,
                      :fingerprint => image_fingerprint,
                      :source_host => source_host,
                      :linktype => IMAGE)

  # set host and path - mainly just for aggregation purposes
  new_link.host = image_uri.host.downcase unless image_uri.host.blank?
  new_link.path = CGI.unescape(image_uri.path) unless image_uri.path.blank?
  new_link.save
  return new_link
end

+ (Object) find_or_create_by_linked_url(linked_url, source_host)

Meant to be called while parsing a piece of content, for the URLs that the content links out to.



276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
# File 'app/models/link.rb', line 276

# Looks up, or creates, the link record for a URL found in a piece of content.
#
# linked_url  - the URL as it appears in the content
# source_host - the host the content came from; used for relative URLs and for
#               deciding the link type
#
# Returns:
#   nil  - when the URL cannot be parsed, when it has a scheme that parses to a
#          plain URI::Generic, or when a migrated URL's counterpart cannot be parsed
#   ''   - when the path is a /wiki/File: or /wiki/Image: link
#   link - the existing record whose fingerprint, alternate_fingerprint or
#          alias_fingerprint matches, otherwise a newly built record after save
def self.find_or_create_by_linked_url(linked_url,source_host)
  # make sure the URL is valid format
  begin
    original_uri = URI.parse(linked_url)
  rescue
    return nil
  end
  
  # is this a /wiki/Image:blah or /wiki/File:blah link? - then return nothing - it's ignored
  # NOTE(review): this path returns '' while the other ignored cases return nil - confirm callers rely on that
  if(original_uri.path =~ /^\/wiki\/File:.*/ or original_uri.path =~ /^\/wiki\/Image:(.*)/)
    return ''
  end
  
  # explicitly ignore callto: links
  # a URL without a scheme is treated as http; a URL that has a scheme but still
  # parsed to the generic URI class is ignored
  if(original_uri.scheme.blank?)
    original_uri.scheme = 'http'
  elsif(original_uri.class.name == 'URI::Generic')
    return nil
  end

  # is this a relative url? (no scheme/no host)- so attach the source_host and http
  # to it, to see if that matches an original URL that we have
  if(!original_uri.is_a?(URI::MailTo))
    if(original_uri.host.blank?) 
      # wiki link exception inside existing create articles that we still have
      if(original_uri.path =~ %r{^/wiki/} and source_host == 'create.extension.org')
        original_uri.host = 'cop.extension.org'
      else
        original_uri.host = source_host 
      end
    end
  end
  
  # for comparison purposes - we need to drop the fragment - the caller is going to
  # need to maintain the fragment when they get an URI back from this class.
  if(!original_uri.fragment.blank?)
    original_uri.fragment = nil
  end
  
  # check both the fingerprint and alternate_fingerprint and alias_fingerprint
  # the fingerprint is the SHA1 of the downcased, CGI-unescaped URL
  original_uri_fingerprint = Digest::SHA1.hexdigest(CGI.unescape(original_uri.to_s.downcase))    
  if(this_link = self.where("fingerprint = ? or alternate_fingerprint = ? or alias_fingerprint = ?",original_uri_fingerprint,original_uri_fingerprint,original_uri_fingerprint).first)
    return this_link
  end
    
  # create it - if host matches source_host and we want to identify this as "wanted" - then make it wanted else - call it external
  this_link = self.new(:source_host => source_host)
  # check to see if this is a migrated url
  if(self.is_copwiki?(original_uri.host) and migrated_url = MigratedUrl.find_by_alias_url_fingerprint(original_uri_fingerprint))
    # the linked URL is the alias of a migrated URL: store the migration target as
    # the link's url and keep the linked URL as the alias
    begin 
      target_uri = URI.parse(migrated_url.target_url)
      target_url_fingerprint = Digest::SHA1.hexdigest(CGI.unescape(target_uri.to_s.downcase))
      this_link.url = target_uri.to_s
      this_link.fingerprint = target_url_fingerprint
      this_link.alias_url = original_uri.to_s
      this_link.alias_fingerprint = original_uri_fingerprint
      this_link.linktype = WANTED
      # set host and path - mainly just for aggregation purposes
      if(!target_uri.host.blank?)
        this_link.host = target_uri.host.downcase
      end
      if(!target_uri.path.blank?)
        this_link.path = CGI.unescape(target_uri.path)
      end
    rescue
      return nil
    end
  elsif(self.is_create?(original_uri.host) and migrated_url = MigratedUrl.find_by_target_url_fingerprint(original_uri_fingerprint))
    # the linked URL is the target of a migrated URL: keep it as the link's url and
    # record the migration's alias alongside it
    begin 
      alias_uri = URI.parse(migrated_url.alias_url)
      alias_url_fingerprint = Digest::SHA1.hexdigest(CGI.unescape(alias_uri.to_s.downcase))
      this_link.alias_url = alias_uri.to_s
      this_link.alias_fingerprint = alias_url_fingerprint
      this_link.url = original_uri.to_s
      this_link.fingerprint = original_uri_fingerprint
      this_link.linktype = WANTED
      # set host and path - mainly just for aggregation purposes
      if(!original_uri.host.blank?)
        this_link.host = original_uri.host.downcase
      end
      if(!original_uri.path.blank?)
        this_link.path = CGI.unescape(original_uri.path)
      end
    rescue
      return nil
    end  
  else
    this_link.url = original_uri.to_s
    this_link.fingerprint = original_uri_fingerprint
      
    # classify the link; the first matching rule wins, so the order of this chain matters
    if(original_uri.is_a?(URI::MailTo))
      this_link.linktype = MAILTO
    elsif(self.is_create?(source_host) and self.is_voldemort?(original_uri.host) and original_uri.path =~ %r{^/sites/default/files/.*})
      # exemption for create and directfile links
      this_link.linktype = DIRECTFILE
    elsif(self.is_create?(source_host) and original_uri.path =~ %r{^/taxonomy/term/(\d+)})
      # exemption for create and links to taxonomy terms
      this_link.linktype = CATEGORY
    elsif(original_uri.path =~ %r{^/wiki/Category:.*})  
      this_link.linktype = CATEGORY
    elsif(self.is_create?(source_host) and self.is_voldemort?(original_uri.host) and original_uri.path =~ %r{^/mediawiki/.*})  
      this_link.linktype = DIRECTFILE
    elsif(self.is_create?(source_host) and self.is_voldemort?(original_uri.host) and original_uri.path =~ %r{^/learninglessons/.*})  
      this_link.linktype = DIRECTFILE
    elsif(original_uri.host == source_host)
      this_link.linktype = WANTED
    elsif(self.is_copwiki?(original_uri.host))
      # host is cop.extension.org, doesn't match the above and wasn't migrated, call it wanted
      this_link.linktype = WANTED
    elsif(original_uri.host.downcase == 'extension.org' or original_uri.host.downcase =~ /\.extension\.org$/)
      # host is extension
      this_link.linktype = LOCAL
    else
      this_link.linktype = EXTERNAL      
    end
      
    # set host and path - mainly just for aggregation purposes
    if(!original_uri.host.blank?)
      this_link.host = original_uri.host.downcase
    end
    if(!original_uri.path.blank?)
      this_link.path = CGI.unescape(original_uri.path)
    end
  end
  
  this_link.save
  return this_link        
end

+ (Boolean) is_copwiki?(host)

Returns:

  • (Boolean)


69
70
71
# File 'app/models/link.rb', line 69

# True when +host+ is exactly one of the two cop hostnames listed here.
def self.is_copwiki?(host)
  %w[cop.extension.org cop.demo.extension.org].include?(host)
end

+ (Boolean) is_create?(host)

Returns:

  • (Boolean)


65
66
67
# File 'app/models/link.rb', line 65

# True when +host+ is exactly one of the two create hostnames listed here.
def self.is_create?(host)
  %w[create.extension.org create.demo.extension.org].include?(host)
end

+ (Boolean) is_voldemort?(host)

note to the future humorless, the www site is currently (as of this commit) the extension.org site that “has no name” (and multiple attempts in the staff to attempt to give it a name) - so in an effort to encapsulate something that needs to resolve to “www” - I called it voldemort. <jayoung>

Returns:

  • (Boolean)


94
95
96
# File 'app/models/link.rb', line 94

# True when +host+ passes any of the create, copwiki or www host checks.
def self.is_voldemort?(host)
  is_create?(host) ||
    is_copwiki?(host) ||
    is_www?(host)
end

+ (Boolean) is_www?(host)

Returns:

  • (Boolean)


73
74
75
# File 'app/models/link.rb', line 73

# True when +host+ is exactly one of the two www hostnames listed here.
def self.is_www?(host)
  %w[www.extension.org www.demo.extension.org].include?(host)
end

+ (Object) linktype_to_description(linktype)



557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
# File 'app/models/link.rb', line 557

# Maps a link type constant to its lowercase description.
#
# linktype - one of the link type constants (WANTED .. IMAGE)
#
# Returns the description String, or 'unknown' for any other value.
def self.linktype_to_description(linktype)
  descriptions = [
    [WANTED, 'wanted'],
    [INTERNAL, 'internal'],
    [EXTERNAL, 'external'],
    [MAILTO, 'mailto'],
    [CATEGORY, 'category'],
    [DIRECTFILE, 'directfile'],
    [LOCAL, 'local'],
    [IMAGE, 'image']
  ]
  # === keeps the same matching rule a case/when would apply
  matched = descriptions.find { |code, _description| code === linktype }
  matched ? matched.last : 'unknown'
end

Instance Method Details

- (Object) change_alternate_url



174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# File 'app/models/link.rb', line 174

# Refreshes alternate_url / alternate_fingerprint from the page's alternate
# source URL and saves the link.
#
# Nothing is changed when the page's alternate source URL equals its source URL
# or cannot be parsed.
def change_alternate_url
  return if self.page.alternate_source_url == self.page.source_url

  begin
    parsed_alternate = URI.parse(page.alternate_source_url)
    alternate_digest = Digest::SHA1.hexdigest(CGI.unescape(parsed_alternate.to_s.downcase))
  rescue
    # do nothing
  end
  return unless parsed_alternate

  self.alternate_url = parsed_alternate.to_s
  self.alternate_fingerprint = alternate_digest
  self.save
end

- (Object) change_to_wanted



165
166
167
168
169
170
171
172
# File 'app/models/link.rb', line 165

# Demotes an INTERNAL link to WANTED and has every page that links to it
# re-store its content. Links of any other type are left untouched.
def change_to_wanted
  return unless self.linktype == INTERNAL

  self.update_attribute(:linktype,WANTED)
  # store_content parses links and images again and saves it
  self.linkedpages.each { |linked_page| linked_page.store_content }
end

- (Object) check_url(options = {})



459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
# File 'app/models/link.rb', line 459

# Checks this link's URL and records the outcome on the record.
#
# options - Hash of flags; a missing or nil value selects the default:
#   :save                 - save the record after the check (default true)
#   :force_error_check    - check even when error_count has reached MAX_ERROR_COUNT (default false)
#   :make_get_request     - send a GET instead of a HEAD on the first request (default false)
#   :check_again_with_get - repeat a HEAD that came back 403, 404 or 405 as a GET (default true)
#
# Returns the result Hash from the class-level check_url, or nil when the check
# was skipped because error_count has reached MAX_ERROR_COUNT.
def check_url(options = {})
  save = (!options[:save].nil? ? options[:save] : true)
  force_error_check = (!options[:force_error_check].nil? ? options[:force_error_check] : false)
  make_get_request = (!options[:make_get_request].nil? ? options[:make_get_request] : false)
  check_again_with_get = (!options[:check_again_with_get].nil? ? options[:check_again_with_get] : true)

  return if(!force_error_check and self.error_count >= MAX_ERROR_COUNT)

  self.last_check_at = Time.zone.now
  result = self.class.check_url(self.url,make_get_request)
  # a HEAD that came back 403, 404 or 405 is repeated once as a GET before the link is judged
  if(result[:responded] and !make_get_request and check_again_with_get and ['403','404','405'].include?(result[:code]))
    result = self.class.check_url(self.url,true)
  end

  check_failed = false
  if(result[:responded])
    self.last_check_response = true
    self.last_check_information = {:response_headers => result[:response].to_hash}
    self.last_check_code = result[:code]
    case result[:code]
    when '200'
      self.status = OK
      self.last_check_status = OK
      self.error_count = 0
    when '301', '302', '303', '307'
      self.status = OK_REDIRECT
      self.last_check_status = OK_REDIRECT
      self.error_count = 0
    else
      check_failed = true
    end
  elsif(result[:ignored])
    self.last_check_response = false
    self.status = IGNORED
    self.last_check_status = IGNORED
  else
    self.last_check_response = false
    self.last_check_information = {:error => result[:error]}
    check_failed = true
  end

  if(check_failed)
    # the link stays a WARNING until it has failed MAX_WARNING_COUNT times, then becomes BROKEN
    self.error_count += 1
    self.status = (self.error_count >= MAX_WARNING_COUNT) ? BROKEN : WARNING
    self.last_check_status = BROKEN
  end

  # honour :save => false; previously the option was read but the record was always saved
  self.save if(save)
  return result
end

- (Object) href_url



122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# File 'app/models/link.rb', line 122

# Returns the value to use as the href for this link, chosen by linktype.
#
# WANTED                    - ''
# INTERNAL                  - the linked page's href_url
# EXTERNAL, LOCAL, MAILTO   - the stored url
# CATEGORY                  - a category tag index URL built from the path, or '' when
#                             the path matches neither known category pattern
# DIRECTFILE                - the stored path
# IMAGE                     - the path on http://www.extension.org for copwiki/create
#                             hosts, otherwise the stored url
#
# Any other linktype yields nil. The URL writer defaults (host, protocol and,
# when configured, port) are set from Settings before the URL is built.
def href_url
  default_url_options[:host] = Settings.urlwriter_host
  default_url_options[:protocol] = Settings.urlwriter_protocol
  if(default_port = Settings.urlwriter_port)
   default_url_options[:port] = default_port
  end

  case self.linktype
  when WANTED
    return ''
  when INTERNAL
    self.page.href_url
  when EXTERNAL, LOCAL, MAILTO
    self.url
  when CATEGORY
    if(self.path =~ /^\/wiki\/Category\:(.+)/)
      # wiki category names use underscores where the tag name has spaces
      content_tag = $1.gsub(/_/, ' ')
      category_tag_index_url(:content_tag => Tag.url_display_name(content_tag))
    elsif(self.is_create? and self.path =~ %r{^/taxonomy/term/(\d+)})
      # special case for Create taxonomy terms
      # NOTE(review): if CreateTaxonomyTerm.find raises for a missing id (as ActiveRecord's
      # find does), the else branch below is never reached - confirm the intended lookup
      if(taxonomy_term = CreateTaxonomyTerm.find($1))
        category_tag_index_url(:content_tag => Tag.url_display_name(taxonomy_term.name))
      else
        ''
      end
    else
      ''
    end
  when DIRECTFILE
    self.path
  when IMAGE
    if(self.is_copwiki_or_create?)
      "http://www.extension.org#{self.path}"
    else
      self.url
    end
  end
end

- (Boolean) is_copwiki?

Returns:

  • (Boolean)


81
82
83
# File 'app/models/link.rb', line 81

# True when this link's host passes the class-level is_copwiki? check.
def is_copwiki?
  link_class = self.class
  link_class.is_copwiki?(self.host)
end

- (Boolean) is_copwiki_or_create?

Returns:

  • (Boolean)


85
86
87
# File 'app/models/link.rb', line 85

# True when this link's host passes the class-level is_create? or is_copwiki? check.
def is_copwiki_or_create?
  [:is_create?, :is_copwiki?].any? { |host_check| self.class.send(host_check, self.host) }
end

- (Boolean) is_create?

Returns:

  • (Boolean)


77
78
79
# File 'app/models/link.rb', line 77

# True when this link's host passes the class-level is_create? check.
def is_create?
  link_class = self.class
  link_class.is_create?(self.host)
end

- (Object) reset_status



514
515
516
# File 'app/models/link.rb', line 514

# Clears everything recorded by previous checks: status and the last_check_*
# fields become nil and error_count goes back to 0.
def reset_status
  cleared_fields = {
    :status => nil,
    :error_count => 0,
    :last_check_at => nil,
    :last_check_status => nil,
    :last_check_response => nil,
    :last_check_code => nil,
    :last_check_information => nil
  }
  self.update_attributes(cleared_fields)
end

- (Object) status_to_s



101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'app/models/link.rb', line 101

# Human-readable label for this link's status.
#
# Returns 'Not yet checked' when status is blank, the label for one of the
# status constants (OK .. IGNORED), or 'Unknown' for any other value.
def status_to_s
  return 'Not yet checked' if self.status.blank?

  current_status = self.status
  labels = [
    [OK, 'OK'],
    [OK_REDIRECT, 'Redirect'],
    [WARNING, 'Warning'],
    [BROKEN, 'Broken'],
    [IGNORED, 'Ignored']
  ]
  # === keeps the same matching rule a case/when would apply
  matched = labels.find { |code, _label| code === current_status }
  matched ? matched.last : 'Unknown'
end