Class: DocParser::Document

Inherits:
Object
  • Object
show all
Defined in:
lib/docparser/document.rb

Overview

The Document class loads and parses the files.

See Also:

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(filename: nil, encoding: 'utf-8', parser: nil) ⇒ Document

Returns a new instance of Document.


22
23
24
25
26
27
28
29
30
# File 'lib/docparser/document.rb', line 22

# Builds a document: sets up a logger, stores the given options, allocates one
# empty result list per parser output and finally calls +read_file+.
#
# @param filename [String, nil] the filename of the document (kept in +@filename+)
# @param encoding [String] the encoding of the document
# @param parser [Object, nil] object whose +outputs+ decide how many result
#   lists are allocated; when it is nil (the default) or has no outputs,
#   +results+ starts out as an empty Array
def initialize(filename: nil, encoding: 'utf-8', parser: nil)
  @logger = Log4r::Logger.new('docparser::document')
  @logger.debug { "Parsing #{filename}" }
  @encoding = encoding
  @parser = parser
  @filename = filename
  # +parser+ defaults to nil, so guard before asking it for its outputs;
  # calling +outputs+ on nil would raise NoMethodError.
  outputs = @parser && @parser.outputs
  @results = Array.new(outputs ? outputs.length : 0) { [] }
  read_file
end

Instance Attribute Details

#doc ⇒ Nokogiri::HTML::Document (readonly)

Returns a reference to the Nokogiri document.

Returns:

  • (Nokogiri::HTML::Document)

    a reference to the Nokogiri document


11
12
13
# File 'lib/docparser/document.rb', line 11

# Read-only accessor for +@doc+.
#
# @return [Nokogiri::HTML::Document] a reference to the Nokogiri document
def doc
  @doc
end

#encoding ⇒ String (readonly)

Returns the encoding of the document.

Returns:

  • (String)

    the encoding of the document


14
15
16
# File 'lib/docparser/document.rb', line 14

# Read-only accessor for +@encoding+.
#
# @return [String] the encoding of the document
def encoding
  @encoding
end

#filename ⇒ String (readonly)

Returns the filename of the current document.

Returns:

  • (String)

    the filename of the current document


8
9
10
# File 'lib/docparser/document.rb', line 8

# Read-only accessor for +@filename+.
#
# @return [String] the filename of the current document
def filename
  @filename
end

#html ⇒ String (readonly)

Returns the source of the document.

Returns:

  • (String)

    the source of the document


20
21
22
# File 'lib/docparser/document.rb', line 20

# Read-only accessor for +@html+.
#
# @return [String] the source of the document
def html
  @html
end

#results ⇒ Array (readonly)

Returns the results from this document.

Returns:

  • (Array)

    the results from this document


17
18
19
# File 'lib/docparser/document.rb', line 17

# Read-only accessor for +@results+.
#
# @return [Array] the results from this document
def results
  @results
end

Instance Method Details

#add_row(*row, output: 0) ⇒ Object

Adds a row to an output


33
34
35
36
37
# File 'lib/docparser/document.rb', line 33

# Appends a flattened row to one of the result lists.
#
# @param row [Array] the cells of the row; nested arrays are flattened
# @param output [Integer, Output] either an index into +results+ or an
#   +Output+ instance, which is translated to its position in
#   +@parser.outputs+
# @return [Array] the result list the row was appended to
def add_row(*row, output: 0)
  # An Output instance is looked up by position; anything else is used as-is.
  target = output.is_a?(Output) ? @parser.outputs.index(output) : output
  cells = row.flatten
  @logger.debug { "#{filename}: Adding row #{cells}" }
  results[target] << cells
end

#parse!(&block) ⇒ Array

Parses the document

Returns:

  • (Array)

    containing the parse results


73
74
75
76
# File 'lib/docparser/document.rb', line 73

# Parses the document by running the given block via +instance_exec+, i.e.
# with +self+ set to this object, so the block can call its methods directly.
#
# @param block [Proc] the parsing logic to evaluate against this document
# @return [Array] containing the parse results (the value of +results+ after
#   the block has run)
def parse!(&block)
  instance_exec(&block)
  results
end

#regexp(regexp) ⇒ Object

Matches the HTML source using a regular expression


67
68
69
# File 'lib/docparser/document.rb', line 67

# Matches the HTML source using a regular expression.
#
# @param regexp [Regexp] the pattern handed to +html.match+
# @return [Object] whatever +html.match(regexp)+ returns — presumably a
#   MatchData or nil, assuming +html+ is a String (confirm against the reader)
def regexp(regexp)
  html.match(regexp)
end

#title ⇒ String

Extracts the document title

Returns:

  • (String)

    the title of the document


41
42
43
# File 'lib/docparser/document.rb', line 41

# Extracts the document title by looking up the content of +//head/title+.
# A truthy result is cached in +@title+ and reused on later calls.
#
# @return [String] the title of the document
def title
  return @title if @title

  @title = xpath_content('//head/title')
end

#xpath(query) ⇒ Object Also known as: css

Executes an XPath query


46
47
48
49
50
51
52
53
# File 'lib/docparser/document.rb', line 46

# Runs +query+ against the parsed document via +@doc.search+.
#
# Without a block the matches are returned directly; with a block each match
# is yielded in turn and the value of the +each+ call is returned.
#
# @param query [String] the query passed to +@doc.search+
# @yield [node] each match, when a block is given
# @return [Object] the search result, or the result of iterating over it
def xpath(query)
  matches = @doc.search(query)
  return matches unless block_given?

  matches.each { |node| yield node }
end

#xpath_content(query) ⇒ String Also known as: css_content

Executes an XPath query and returns the content

Returns:

  • (String)

    the content of the HTML node


57
58
59
60
61
62
63
64
# File 'lib/docparser/document.rb', line 57

# Runs +query+ via +@doc.search+ and returns the content of the first match.
#
# @param query [String] the query passed to +@doc.search+
# @return [String, nil] the content of the first matching node, or nil when
#   nothing matched
def xpath_content(query)
  node = @doc.search(query).first
  node.content unless node.nil?
end