Class: DocParser::Document

Inherits:
Object
  • Object
show all
Defined in:
lib/docparser/document.rb

Overview

The Document class loads and parses the files.

See Also:

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(filename: nil, encoding: 'utf-8', parser: nil) ⇒ Document


22
23
24
25
26
27
28
29
30
# File 'lib/docparser/document.rb', line 22

# Builds a document: stores the constructor arguments, prepares one empty
# result list per parser output and then calls +read_file+ (defined
# elsewhere in this class).
#
# @param filename [String, nil] name of the file this document represents
# @param encoding [String] encoding name kept in +@encoding+
# @param parser [#outputs, nil] object whose +outputs+ determine how many
#   result lists are created; may be omitted
def initialize(filename: nil, encoding: 'utf-8', parser: nil)
  @logger = Log4r::Logger.new('docparser::document')
  @logger.debug { "Parsing #{filename}" }
  @encoding = encoding
  @parser = parser
  @filename = filename
  # +parser+ defaults to nil, so it must not be dereferenced blindly;
  # a missing parser is treated like a parser without outputs.
  outputs = @parser ? @parser.outputs : nil
  @results = Array.new(outputs ? outputs.length : 0) { [] }
  read_file
end

Instance Attribute Details

#doc ⇒ Nokogiri::HTML::Document (readonly)


11
12
13
# File 'lib/docparser/document.rb', line 11

# Reader for the +@doc+ instance variable.
#
# @return [Nokogiri::HTML::Document]
def doc; @doc; end

#encoding ⇒ String (readonly)


14
15
16
# File 'lib/docparser/document.rb', line 14

# Reader for the +@encoding+ instance variable.
#
# @return [String]
def encoding; @encoding; end

#filename ⇒ String (readonly)


8
9
10
# File 'lib/docparser/document.rb', line 8

# Reader for the +@filename+ instance variable.
#
# @return [String]
def filename; @filename; end

#html ⇒ String (readonly)


20
21
22
# File 'lib/docparser/document.rb', line 20

# Reader for the +@html+ instance variable.
#
# @return [String]
def html; @html; end

#results ⇒ Array (readonly)


17
18
19
# File 'lib/docparser/document.rb', line 17

# Reader for the +@results+ instance variable.
#
# @return [Array]
def results; @results; end

Instance Method Details

#add_row(*row, output: 0) ⇒ Object

Adds a row to an output


33
34
35
36
37
# File 'lib/docparser/document.rb', line 33

# Appends a row to the result list of one output.
#
# @param row [Array] the cell values; nested arrays are flattened
# @param output [Integer, Output] either an index into +results+ or an
#   Output, which is translated to its position in +@parser.outputs+
# @return [Array] the result list the row was appended to
def add_row(*row, output: 0)
  index = output.is_a?(Output) ? @parser.outputs.index(output) : output
  cells = row.flatten
  @logger.debug { "#{filename}: Adding row #{cells}" }
  results[index] << cells
end

#parse!(&block) ⇒ Array

Parses the document


73
74
75
76
# File 'lib/docparser/document.rb', line 73

# Runs the given block in the context of this document (via
# +instance_exec+), so the block can call the document's methods directly.
#
# @yield the extraction code to run against this document
# @return [Array] whatever +results+ returns after the block has run
def parse!(&extractor)
  instance_exec(&extractor)
  results
end

#regexp(regexp) ⇒ Object

Matches the HTML source using a regular expression


67
68
69
# File 'lib/docparser/document.rb', line 67

# Matches the document's HTML source (as returned by +html+) against a
# pattern.
#
# @param regexp [Regexp, String] the pattern handed to +match+
# @return [MatchData, nil] the match, or nil when the pattern does not match
def regexp(regexp)
  source = html
  source.match(regexp)
end

#title ⇒ String

Extracts the document title


41
42
43
# File 'lib/docparser/document.rb', line 41

# Returns the content found by the '//head/title' query. A truthy value is
# cached in +@title+ and reused on later calls.
#
# @return [String, nil] the title, or nil when +xpath_content+ finds nothing
def title
  return @title if @title

  @title = xpath_content('//head/title')
end

#xpath(query) ⇒ Object Also known as: css

Executes an XPath query


46
47
48
49
50
51
52
53
# File 'lib/docparser/document.rb', line 46

# Runs +query+ through +@doc.search+.
#
# Without a block the search result is returned as is. With a block every
# element of the result is yielded and the value of +each+ is returned.
#
# @param query [String] the query handed to +@doc.search+
# @yield [node] each element of the search result
# @return [Object] the search result (or the return value of its +each+)
def xpath(query)
  matches = @doc.search(query)
  return matches unless block_given?

  matches.each { |node| yield node }
end

#xpath_content(query) ⇒ String Also known as: css_content

Executes an XPath query and returns the content of the first match


57
58
59
60
61
62
63
64
# File 'lib/docparser/document.rb', line 57

# Runs +query+ through +@doc.search+ and returns the +content+ of the first
# element of the result.
#
# @param query [String] the query handed to +@doc.search+
# @return [String, nil] the first match's content, or nil when nothing matched
def xpath_content(query)
  node = @doc.search(query).first
  return if node.nil?

  node.content
end