Class: Link

Inherits:
ActiveRecord::Base
  • Object
show all
Includes:
ActionController::UrlWriter
Defined in:
app/models/link.rb

Constant Summary

WANTED =

link types

1
INTERNAL =
2
EXTERNAL =
3
MAILTO =
4
CATEGORY =
5
DIRECTFILE =
6
LOCAL =
7
IMAGE =
8
OK =

status codes

1
OK_REDIRECT =
2
WARNING =
3
BROKEN =
4
IGNORED =
5
MAX_WARNING_COUNT =

maximum number of times a broken link reports broken before warning goes to error

3
MAX_ERROR_COUNT =

maximum number of times we'll check a broken link before giving up

10

Class Method Summary (collapse)

Instance Method Summary (collapse)

Class Method Details

+ (Object) check_url(url, make_get_request = false)



503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
# File 'app/models/link.rb', line 503

# Makes a single HTTP(S) request for +url+ and reports what happened.
#
# @param url [#to_s] the address to verify; it is re-parsed here even though
#   callers are expected to have validated it already
# @param make_get_request [Boolean] send a GET instead of the default HEAD
# @return [Hash] one of
#   * {:responded => true, :code => <status code string>, :response => <response>}
#   * {:responded => false, :ignored => true} when the scheme is not http/https
#   * {:responded => false, :error => <message>} when parsing or the request failed
def self.check_url(url,make_get_request=false)
  headers = {'User-Agent' => 'extension.org link verification'}
  # the URL should likely have been validated already, but do it again for good measure
  begin
    check_uri = URI.parse(url.to_s)
  rescue StandardError => exception
    return {:responded => false, :error => exception.message}
  end

  # only web links can be verified; anything else is reported as ignored
  unless(['http','https'].include?(check_uri.scheme))
    return {:responded => false, :ignored => true}
  end

  # check it!
  begin
    http_connection = Net::HTTP.new(check_uri.host, check_uri.port)
    if(check_uri.scheme == 'https')
      # certificate verification is switched off: only reachability is checked
      http_connection.verify_mode = OpenSSL::SSL::VERIFY_NONE
      http_connection.use_ssl = true
    end

    # an empty path is requested as "/", and the query string is carried along
    request_path = check_uri.path.blank? ? "/" : check_uri.path
    if(!check_uri.query.blank?)
      request_path = "#{request_path}?#{check_uri.query}"
    end

    if(make_get_request)
      response = http_connection.get(request_path,headers)
    else
      response = http_connection.head(request_path,headers)
    end
    {:responded => true, :code => response.code, :response => response}
  rescue StandardError, Timeout::Error => exception
    # Network, SSL and timeout failures are reported to the caller.
    # Timeout::Error is listed explicitly because it is not a StandardError on
    # older Rubies. Interrupts, SystemExit and the like are no longer swallowed.
    return {:responded => false, :error => exception.message}
  end
end

+ (Object) count_by_linktype



562
563
564
565
566
567
568
569
# File 'app/models/link.rb', line 562

# Tallies links per link type.
#
# @return [Hash] description of the link type (see linktype_to_description)
#   mapped to the count reported by the grouped query
def self.count_by_linktype
  Link.count(:group => :linktype).inject({}) do |totals, (type_code, total)|
    totals[self.linktype_to_description(type_code)] = total
    totals
  end
end

+ (Object) create_from_page(page)



175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
# File 'app/models/link.rb', line 175

# Finds or builds the Link that represents +page+ itself (an INTERNAL link).
#
# An existing link is matched on the fingerprint of the page's source URL, of
# its alternate source URL, or of its old/alias URL. When one is found it is
# re-pointed at the page and every page that links to it is re-stored.
# Otherwise a new link is built and saved.
#
# @param page [Object] must respond to source_url, alternate_source_url,
#   old_source_url and page_source
# @return [Link, nil] nil when the page has no source URL or the source URL
#   cannot be parsed. NOTE(review): the result of save is not checked.
def self.create_from_page(page)
  return nil if(page.source_url.blank?)

  # make sure the URL is valid format
  begin
    source_uri = URI.parse(page.source_url)
    source_uri_fingerprint = Digest::SHA1.hexdigest(CGI.unescape(source_uri.to_s.downcase))
  rescue
    return nil
  end

  # special case for where the alternate != source_url
  alternate_source_uri = nil
  alternate_source_uri_fingerprint = nil
  if(page.alternate_source_url != page.source_url)
    begin
      alternate_source_uri = URI.parse(page.alternate_source_url)
      alternate_source_uri_fingerprint = Digest::SHA1.hexdigest(CGI.unescape(alternate_source_uri.to_s.downcase))
    rescue
      # an unparseable alternate URL is simply left out of the lookup
    end
  end

  # special case for create urls - does this have an alias_uri?
  old_source_uri = nil
  old_source_uri_fingerprint = nil
  if(page.page_source and page.page_source.name == 'create')
    if(!page.old_source_url.blank?)
      begin
        old_source_uri = URI.parse(page.old_source_url)
        old_source_uri_fingerprint = Digest::SHA1.hexdigest(CGI.unescape(old_source_uri.to_s.downcase))
      rescue
        # an unparseable old URL is simply left out of the lookup
      end
    elsif(migrated_url = MigratedUrl.find_by_target_url_fingerprint(source_uri_fingerprint))
      old_source_uri = migrated_url.alias_url
      old_source_uri_fingerprint = migrated_url.alias_url_fingerprint
    end
  end

  # build the lookup with bind parameters rather than interpolating values into SQL
  conditions = ["fingerprint = ?"]
  bind_values = [source_uri_fingerprint]
  if(alternate_source_uri)
    conditions << "alternate_fingerprint = ?"
    bind_values << alternate_source_uri_fingerprint
  end
  if(old_source_uri)
    conditions << "alias_fingerprint = ?"
    bind_values << old_source_uri_fingerprint
  end

  if(this_link = self.where(conditions.join(' OR '), *bind_values).first)
    # this was a wanted link - we need to update the link now - and kick off the process of updating everything
    # that links to this page
    this_link.update_attributes(:page => page, :linktype => INTERNAL)
    this_link.linkedpages.each do |linked_page|
      linked_page.store_content # parses links and images again and saves it.
    end
  else
    this_link = self.new(:page => page, :url => source_uri.to_s, :fingerprint => source_uri_fingerprint)

    if(alternate_source_uri)
      this_link.alternate_url = alternate_source_uri.to_s
      this_link.alternate_fingerprint = alternate_source_uri_fingerprint
    end

    if(old_source_uri)
      this_link.alias_url = old_source_uri.to_s
      this_link.alias_fingerprint = old_source_uri_fingerprint
    end

    this_link.source_host = source_uri.host
    this_link.linktype = INTERNAL

    # set host and path - mainly just for aggregation purposes
    if(!source_uri.host.blank?)
      this_link.host = source_uri.host
    end
    if(!source_uri.path.blank?)
      this_link.path = CGI.unescape(source_uri.path)
    end
    this_link.save
  end

  this_link
end

+ (Object) find_or_create_by_image_reference(image_reference, source_host)

This is meant to be called while parsing a piece of content, once for each image the content references.



390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
# File 'app/models/link.rb', line 390

# Finds or builds the IMAGE link for an image referenced from a piece of content.
#
# @param image_reference [String] the image URL as it appeared in the content
# @param source_host [String] host of the content being parsed; used for
#   references that carry no host of their own
# @return [Link, nil] nil when the reference cannot be parsed or is an opaque
#   URI (data:, about:, javascript:, ...). NOTE(review): the result of save is
#   not checked.
def self.find_or_create_by_image_reference(image_reference,source_host)
  # make sure the URL is valid format
  begin
    original_uri = URI.parse(image_reference)
  rescue
    return nil
  end

  # data: URIs - and every other opaque URI - have no host or path to work
  # with; assigning a host to one raises URI::InvalidURIError, so skip them
  if(original_uri.scheme == 'data' or !original_uri.opaque.nil?)
    return nil
  end

  if(original_uri.host.blank?)
    # wiki link exception inside existing create articles that we still have
    if(original_uri.path =~ %r{^/mediawiki/} and source_host == 'create.extension.org')
      original_uri.host = 'cop.extension.org'
    else
      original_uri.host = source_host
    end
  end
  original_uri.scheme = 'http' if(original_uri.scheme.blank?)

  # for comparison purposes - we need to drop the fragment - the caller is going to
  # need to maintain the fragment when they get an URI back from this class.
  if(!original_uri.fragment.blank?)
    original_uri.fragment = nil
  end

  # NOTE(review): unlike the other finders, the URL is not downcased before
  # fingerprinting here - left as is because stored fingerprints depend on it
  image_url = original_uri.to_s
  image_fingerprint = Digest::SHA1.hexdigest(CGI.unescape(image_url))

  if(this_link = self.find_by_fingerprint(image_fingerprint))
    # a link first seen as something else is re-typed as an image
    if(this_link.linktype != IMAGE)
      this_link.update_attribute(:linktype, IMAGE)
    end
    return this_link
  end

  this_link = self.new(:url => image_url,
                       :fingerprint => image_fingerprint,
                       :source_host => source_host,
                       :linktype => IMAGE)

  # set host and path - mainly just for aggregation purposes
  if(!original_uri.host.blank?)
    this_link.host = original_uri.host.downcase
  end
  if(!original_uri.path.blank?)
    this_link.path = CGI.unescape(original_uri.path)
  end
  this_link.save
  return this_link
end

+ (Object) find_or_create_by_linked_url(linked_url, source_host)

This is meant to be called while parsing a piece of content, once for each URL the content links out to.



260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
# File 'app/models/link.rb', line 260

# Looks up the Link for a URL found in a piece of content, building,
# classifying and saving a new Link when no existing one matches.
#
# @param linked_url [String] the URL as it appeared in the content
# @param source_host [String] host of the content being parsed; used to fill
#   in host-less URLs and to classify the link
# @return [Link, String, nil]
#   * nil when the URL cannot be parsed, when it has a scheme that parses to a
#     plain URI::Generic, or when handling a migrated URL raises
#   * '' (empty string) for /wiki/File: and /wiki/Image: paths, which are ignored
#   * otherwise the matching or newly built Link.
#     NOTE(review): the result of save is not checked.
def self.find_or_create_by_linked_url(linked_url,source_host)
  # make sure the URL is valid format
  begin
    original_uri = URI.parse(linked_url)
  rescue
    return nil
  end
  
  # is this a /wiki/Image:blah or /wiki/File:blah link? - then return nothing - it's ignored
  # (note: this path returns an empty string, not nil like the other early exits)
  if(original_uri.path =~ /^\/wiki\/File:.*/ or original_uri.path =~ /^\/wiki\/Image:(.*)/)
    return ''
  end
  
  # scheme-less URLs are treated as http; any scheme that did not parse to a
  # scheme-specific URI class (e.g. callto:) is explicitly ignored
  if(original_uri.scheme.blank?)
    original_uri.scheme = 'http'
  elsif(original_uri.class.name == 'URI::Generic')
    return nil
  end

  # is this a relative url? (no scheme/no host)- so attach the source_host and http
  # to it, to see if that matches an original URL that we have
  if(!original_uri.is_a?(URI::MailTo))
    if(original_uri.host.blank?) 
      # wiki link exception inside existing create articles that we still have
      if(original_uri.path =~ %r{^/wiki/} and source_host == 'create.extension.org')
        original_uri.host = 'cop.extension.org'
      else
        original_uri.host = source_host 
      end
    end
  end
  
  # for comparison purposes - we need to drop the fragment - the caller is going to
  # need to maintain the fragment when they get an URI back from this class.
  if(!original_uri.fragment.blank?)
    original_uri.fragment = nil
  end
  
  # check both the fingerprint and alternate_fingerprint and alias_fingerprint
  # (the fingerprint is the SHA1 of the downcased, CGI-unescaped URL string)
  original_uri_fingerprint = Digest::SHA1.hexdigest(CGI.unescape(original_uri.to_s.downcase))    
  if(this_link = self.where("fingerprint = ? or alternate_fingerprint = ? or alias_fingerprint = ?",original_uri_fingerprint,original_uri_fingerprint,original_uri_fingerprint).first)
    return this_link
  end
    
  # create it - if host matches source_host and we want to identify this as "wanted" - then make it wanted else - call it external
  this_link = self.new(:source_host => source_host)
  # check to see if this is a migrated url
  if(self.is_copwiki?(original_uri.host) and migrated_url = MigratedUrl.find_by_alias_url_fingerprint(original_uri_fingerprint))
    # copwiki URL that was migrated: the link points at the migration target
    # and keeps the URL found in the content as its alias
    begin 
      target_uri = URI.parse(migrated_url.target_url)
      target_url_fingerprint = Digest::SHA1.hexdigest(CGI.unescape(target_uri.to_s.downcase))
      this_link.url = target_uri.to_s
      this_link.fingerprint = target_url_fingerprint
      this_link.alias_url = original_uri.to_s
      this_link.alias_fingerprint = original_uri_fingerprint
      this_link.linktype = WANTED
      # set host and path - mainly just for aggregation purposes
      if(!target_uri.host.blank?)
        this_link.host = target_uri.host.downcase
      end
      if(!target_uri.path.blank?)
        this_link.path = CGI.unescape(target_uri.path)
      end
    rescue
      return nil
    end
  elsif(self.is_create?(original_uri.host) and migrated_url = MigratedUrl.find_by_target_url_fingerprint(original_uri_fingerprint))
    # create URL that is itself a migration target: the link keeps the URL
    # found in the content and records the migrated-from URL as its alias
    begin 
      alias_uri = URI.parse(migrated_url.alias_url)
      alias_url_fingerprint = Digest::SHA1.hexdigest(CGI.unescape(alias_uri.to_s.downcase))
      this_link.alias_url = alias_uri.to_s
      this_link.alias_fingerprint = alias_url_fingerprint
      this_link.url = original_uri.to_s
      this_link.fingerprint = original_uri_fingerprint
      this_link.linktype = WANTED
      # set host and path - mainly just for aggregation purposes
      if(!original_uri.host.blank?)
        this_link.host = original_uri.host.downcase
      end
      if(!original_uri.path.blank?)
        this_link.path = CGI.unescape(original_uri.path)
      end
    rescue
      return nil
    end  
  else
    # not a migrated URL: store it as found and classify it; the first
    # matching rule below wins, so the order of the branches matters
    this_link.url = original_uri.to_s
    this_link.fingerprint = original_uri_fingerprint
      
    if(original_uri.is_a?(URI::MailTo))
      this_link.linktype = MAILTO
    elsif(self.is_create?(source_host) and original_uri.path =~ %r{^/sites/default/files/.*})
      # exemption for create and directfile links
      this_link.linktype = DIRECTFILE
    elsif(self.is_create?(source_host) and original_uri.path =~ %r{^/taxonomy/term/(\d+)})
      # exemption for create and links to taxonomy terms
      this_link.linktype = CATEGORY
    elsif(original_uri.path =~ %r{^/wiki/Category:.*})  
      this_link.linktype = CATEGORY
    elsif(original_uri.path =~ %r{^/mediawiki/.*})  
      this_link.linktype = DIRECTFILE
    elsif(original_uri.path =~ %r{^/learninglessons/.*})  
      this_link.linktype = DIRECTFILE
    elsif(original_uri.host == source_host)
      # same host as the content itself: a page we do not have (yet)
      this_link.linktype = WANTED
    elsif(self.is_copwiki?(original_uri.host))
      # host is cop.extension.org, doesn't match the above and wasn't migrated, call it wanted
      this_link.linktype = WANTED
    elsif(original_uri.host.downcase == 'extension.org' or original_uri.host.downcase =~ /\.extension\.org$/)
      # host is extension
      # NOTE(review): host.downcase raises if the host is still nil here,
      # which looks possible when source_host is nil - confirm against callers
      this_link.linktype = LOCAL
    else
      this_link.linktype = EXTERNAL      
    end
      
    # set host and path - mainly just for aggregation purposes
    if(!original_uri.host.blank?)
      this_link.host = original_uri.host.downcase
    end
    if(!original_uri.path.blank?)
      this_link.path = CGI.unescape(original_uri.path)
    end
  end
  
  this_link.save
  return this_link        
end

+ (Boolean) is_copwiki?(host)

Returns:

  • (Boolean)


69
70
71
# File 'app/models/link.rb', line 69

# True when +host+ is exactly one of the two copwiki hostnames listed below.
def self.is_copwiki?(host)
  ['cop.extension.org', 'cop.demo.extension.org'].include?(host)
end

+ (Boolean) is_create?(host)

Returns:

  • (Boolean)


65
66
67
# File 'app/models/link.rb', line 65

# True when +host+ is exactly one of the two create hostnames listed below.
def self.is_create?(host)
  ['create.extension.org', 'create.demo.extension.org'].include?(host)
end

+ (Object) linktype_to_description(linktype)



541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
# File 'app/models/link.rb', line 541

# Translates a link type constant into its lower-case description.
#
# @param linktype [Integer] one of the link type constants (WANTED .. IMAGE)
# @return [String] the description, or 'unknown' for any other value
def self.linktype_to_description(linktype)
  case linktype
  when WANTED
    'wanted'
  when INTERNAL
    'internal'
  when EXTERNAL
    'external'
  when MAILTO
    'mailto'
  when CATEGORY
    'category'
  when DIRECTFILE
    'directfile'
  when LOCAL
    'local'
  when IMAGE
    # image links used to fall through to 'unknown'
    'image'
  else
    'unknown'
  end
end

Instance Method Details

- (Object) change_alternate_url



158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
# File 'app/models/link.rb', line 158

# Copies the page's alternate source URL (and its fingerprint) onto this link
# and saves, but only when the alternate differs from the source URL and can
# be parsed. In every other case nothing is changed or saved.
def change_alternate_url
  linked_page = self.page
  alternate_uri = nil
  alternate_fingerprint = nil

  unless(linked_page.alternate_source_url == linked_page.source_url)
    begin
      alternate_uri = URI.parse(linked_page.alternate_source_url)
      normalized = CGI.unescape(alternate_uri.to_s.downcase)
      alternate_fingerprint = Digest::SHA1.hexdigest(normalized)
    rescue
      # an unparseable alternate URL leaves the link untouched
    end
  end

  return unless(alternate_uri)

  self.alternate_url = alternate_uri.to_s
  self.alternate_fingerprint = alternate_fingerprint
  self.save
end

- (Object) change_to_wanted



149
150
151
152
153
154
155
156
# File 'app/models/link.rb', line 149

# Demotes an INTERNAL link to WANTED and re-stores every page that links to
# it. Links of any other type are left alone.
def change_to_wanted
  return unless(self.linktype == INTERNAL)

  self.update_attribute(:linktype,WANTED)
  # store_content parses links and images again and saves the page
  self.linkedpages.each { |referring_page| referring_page.store_content }
end

- (Object) check_url(options = {})



443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
# File 'app/models/link.rb', line 443

# Checks this link's URL and records the outcome on the link.
#
# @param options [Hash]
#   * :save (default true) - persist the recorded outcome
#   * :force_error_check (default false) - check even after MAX_ERROR_COUNT failures
#   * :make_get_request (default false) - use GET rather than HEAD for the first request
#   * :check_again_with_get (default true) - retry with GET when a HEAD
#     request answers 403, 404 or 405
# @return [Hash, nil] the result hash from the class-level check_url, or nil
#   when the link has already failed MAX_ERROR_COUNT times and the check was
#   not forced
def check_url(options = {})
  save = (options[:save].nil? ? true : options[:save])
  force_error_check = (options[:force_error_check].nil? ? false : options[:force_error_check])
  make_get_request = (options[:make_get_request].nil? ? false : options[:make_get_request])
  check_again_with_get = (options[:check_again_with_get].nil? ? true : options[:check_again_with_get])

  return if(!force_error_check and self.error_count >= MAX_ERROR_COUNT)

  self.last_check_at = Time.zone.now
  result = self.class.check_url(self.url,make_get_request)
  # repeat the check with GET when the server responded to the initial
  # non-GET request with 403, 404 or 405
  if(result[:responded] and !make_get_request and check_again_with_get and ['404','405','403'].include?(result[:code]))
    result = self.class.check_url(self.url,true)
  end

  if(result[:responded])
    self.last_check_response = true
    self.last_check_information = {:response_headers => result[:response].to_hash}
    self.last_check_code = result[:code]
    if(result[:code] == '200')
      self.status = OK
      self.last_check_status = OK
      self.error_count = 0
    elsif(['301','302','303','307'].include?(result[:code]))
      self.status = OK_REDIRECT
      self.last_check_status = OK_REDIRECT
      self.error_count = 0
    else
      # any other code counts as a failure; the link only becomes BROKEN
      # after MAX_WARNING_COUNT failures
      self.error_count += 1
      self.status = (self.error_count >= MAX_WARNING_COUNT ? BROKEN : WARNING)
      self.last_check_status = BROKEN
    end
  elsif(result[:ignored])
    self.last_check_response = false
    self.status = IGNORED
    self.last_check_status = IGNORED
  else
    # no response at all (parse, network or timeout error)
    self.last_check_response = false
    self.last_check_information = {:error => result[:error]}
    self.error_count += 1
    self.status = (self.error_count >= MAX_WARNING_COUNT ? BROKEN : WARNING)
    self.last_check_status = BROKEN
  end

  # honor :save => false - previously the option was read but never used
  self.save if(save)
  return result
end

- (Object) href_url



106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# File 'app/models/link.rb', line 106

# Returns the URL to put in an href/src attribute for this link, by link type.
#
# Side effect: writes host, protocol and (when configured) port from AppConfig
# into default_url_options on every call.
#
# @return [String, nil]
#   * WANTED - empty string
#   * INTERNAL - the linked page's href_url
#   * EXTERNAL, LOCAL, MAILTO - the stored url
#   * CATEGORY - the category tag index URL for a /wiki/Category:Name path or
#     a create /taxonomy/term/<id> path, otherwise an empty string
#   * DIRECTFILE - the stored path
#   * IMAGE - the path on www.extension.org for copwiki/create hosts,
#     otherwise the stored url
#   * any other link type - nil
def href_url
  default_url_options[:host] = AppConfig.get_url_host
  default_url_options[:protocol] = AppConfig.get_url_protocol
  if(default_port = AppConfig.get_url_port)
    default_url_options[:port] = default_port
  end

  case self.linktype
  when WANTED
    return ''
  when INTERNAL
    self.page.href_url
  when EXTERNAL, LOCAL, MAILTO
    self.url
  when CATEGORY
    if(self.path =~ /^\/wiki\/Category\:(.+)/)
      # wiki category names use underscores where the tag uses spaces
      category_name = $1.gsub(/_/, ' ')
      category_tag_index_url(:content_tag => Tag.url_display_name(category_name))
    elsif(self.is_create? and self.path =~ %r{^/taxonomy/term/(\d+)})
      # special case for Create taxonomy terms
      # NOTE(review): if CreateTaxonomyTerm.find raises for a missing id (as
      # ActiveRecord's find does), the else branch below is never reached - confirm
      if(taxonomy_term = CreateTaxonomyTerm.find($1))
        category_tag_index_url(:content_tag => Tag.url_display_name(taxonomy_term.name))
      else
        ''
      end
    else
      ''
    end
  when DIRECTFILE
    self.path
  when IMAGE
    if(self.is_copwiki_or_create?)
      "http://www.extension.org#{self.path}"
    else
      self.url
    end
  end
end

- (Boolean) is_copwiki?

Returns:

  • (Boolean)


77
78
79
# File 'app/models/link.rb', line 77

# Asks the class-level is_copwiki? about this link's own host.
def is_copwiki?
  link_class = self.class
  link_class.is_copwiki?(host)
end

- (Boolean) is_copwiki_or_create?

Returns:

  • (Boolean)


81
82
83
# File 'app/models/link.rb', line 81

# True when the class-level checks recognize this link's host as either a
# create host or a copwiki host (create is tested first).
def is_copwiki_or_create?
  link_class = self.class
  link_host = self.host
  link_class.is_create?(link_host) || link_class.is_copwiki?(link_host)
end

- (Boolean) is_create?

Returns:

  • (Boolean)


73
74
75
# File 'app/models/link.rb', line 73

# Asks the class-level is_create? about this link's own host.
def is_create?
  link_class = self.class
  link_class.is_create?(host)
end

- (Object) reset_status



498
499
500
# File 'app/models/link.rb', line 498

# Clears everything recorded by previous checks: the status and all
# last_check_* attributes are set to nil and the error count to 0.
def reset_status
  cleared = {
    :status => nil,
    :error_count => 0,
    :last_check_at => nil,
    :last_check_status => nil,
    :last_check_response => nil,
    :last_check_code => nil,
    :last_check_information => nil
  }
  self.update_attributes(cleared)
end

- (Object) status_to_s



85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# File 'app/models/link.rb', line 85

# Human-readable label for the link's status: 'Not yet checked' while the
# status is blank, 'Unknown' for a value that is not one of the status codes.
def status_to_s
  current_status = self.status
  return 'Not yet checked' if(current_status.blank?)

  case current_status
  when OK then 'OK'
  when OK_REDIRECT then 'Redirect'
  when WARNING then 'Warning'
  when BROKEN then 'Broken'
  when IGNORED then 'Ignored'
  else 'Unknown'
  end
end