Module: Tabledata::Detection

Defined in:
lib/tabledata/detection.rb

Overview

This module provides methods to detect the encoding of a text file. Currently, detection is limited to UTF-8, Windows-1252 and MacRoman for text in Western European languages.

Constant Summary collapse

# Byte set 0xA0-0xBF plus 0xD7 and 0xF7. Judging by the name, these are bytes
# considered unlikely in Windows-1252 text -- TODO confirm intent.
UnlikelyCharsWin1252 =
"\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD" \
"\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB" \
"\xBC\xBD\xBE\xBF\xD7\xF7"
# Empty; presumably a placeholder for the ISO-8859-1 counterpart of the set above.
UnlikelyCharsIso8859_1 =
""
# Empty; presumably a placeholder for the MacRoman counterpart of the set above.
UnlikelyCharsMacRoman =
""
# The bytes of "äöü" as encoded in MacRoman, tagged as BINARY.
UmlautsMac =
"äöü".encode(Encoding::MacRoman).force_encoding(Encoding::BINARY)
# The bytes of "äöü" as encoded in Windows-1252, tagged as BINARY.
UmlautsWin =
"äöü".encode(Encoding::Windows_1252).force_encoding(Encoding::BINARY)
# The bytes of "âàéèô" as encoded in MacRoman, tagged as BINARY.
DiacritsMac =
"âàéèô".encode(Encoding::MacRoman).force_encoding(Encoding::BINARY)
# The bytes of "âàéèô" as encoded in Windows-1252, tagged as BINARY.
DiacritsWin =
"âàéèô".encode(Encoding::Windows_1252).force_encoding(Encoding::BINARY)

Class Method Summary collapse

Class Method Details

.file_type_from_path(path) ⇒ Object


71
72
73
74
75
76
77
78
# File 'lib/tabledata/detection.rb', line 71

# Determines the table file type from the extension at the end of +path+.
#
# @param path [String] path or file name; it is matched against regular
#   expressions, so a String is expected
# @return [Symbol] :csv, :xls or :xlsx
# @raise [InvalidFileType] if +path+ does not end in one of the known extensions
def file_type_from_path(path)
  # Anchor with \z (absolute end of string). `$` also matches right before any
  # newline, which would accept inputs such as "data.csv\nanything".
  case path
    when /\.csv\z/ then :csv
    when /\.xls\z/ then :xls
    when /\.xlsx\z/ then :xlsx
    else raise InvalidFileType, "Unknown file format for path #{path.inspect}"
  end
end

.force_guessed_encoding!(string) ⇒ Object


29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/tabledata/detection.rb', line 29

def force_guessed_encoding!(string)
  return string if string.force_encoding(Encoding::UTF_8).valid_encoding?
  string.force_encoding(Encoding::BINARY)

  # check for non-mapped codepoints
  possible_encodings = [Encoding::Windows_1252, Encoding::ISO8859_15, Encoding::MacRoman]
  possible_encodings.delete(Encoding::ISO8859_15) if string =~ /[\x80-\x9f]/n
  possible_encodings.delete(Encoding::Windows_1252) if string =~ /[\x81\x8D\x8F\x90\x9D]/n
  return string.force_encoding(possible_encodings.first) if possible_encodings.size == 1

  # # check for occurrences of characters with weighted expectancy
  # # e.g. a "§" is quite unlikely
  # win = string[0,10_000].count(UnlikelyCharsWin1252)
  # iso = string[0,10_000].count(UnlikelyCharsIso8859_1)
  # mac = string[0,10_000].count(UnlikelyCharsMacRoman)

  # Check occurrences of äöü
  case string[0,10_000].count(UmlautsMac) <=> string[0,10_000].count(UmlautsWin)
    when -1 then return string.force_encoding(Encoding::Windows_1252)
    when  1 then return string.force_encoding(Encoding::MacRoman)
  end

  # Check occurrences of âàéèô
  case string[0,10_000].count(DiacritsMac) <=> string[0,10_000].count(DiacritsWin)
    when -1 then return string.force_encoding(Encoding::Windows_1252)
    when  1 then return string.force_encoding(Encoding::MacRoman)
  end

  # Bias for Windows_1252
  string.force_encoding(Encoding::Windows_1252)
end

.guess_csv_delimiter(csv, out_of = [',',';']) ⇒ Object


65
66
67
68
69
# File 'lib/tabledata/detection.rb', line 65

# Picks the most frequent delimiter candidate in the beginning of +csv+.
#
# Only the first 10_000 characters of +csv+ are inspected. Candidates are
# converted to the encoding of +csv+ before counting, and the converted
# candidate is what gets returned.
#
# @param csv [String] the CSV data
# @param out_of [Array<String>] candidate delimiters
# @return [String, nil] the candidate with the highest count, or nil if
#   +out_of+ is empty
def guess_csv_delimiter(csv, out_of=[',',';'])
  target_encoding = csv.encoding
  candidates      = out_of.map { |candidate| candidate.encode(target_encoding) }
  sample          = csv[0, 10_000]

  candidates.max_by { |candidate| sample.count(candidate) }
end

.guess_encoding(string) ⇒ Object


61
62
63
# File 'lib/tabledata/detection.rb', line 61

# Returns the guessed encoding of +string+ without modifying it.
#
# The guess is made by force_guessed_encoding! on a duplicate of +string+.
#
# @param string [String] the string to inspect
# @return [Encoding] the encoding of the string returned by
#   force_guessed_encoding!
def guess_encoding(string)
  copy    = string.dup
  guessed = force_guessed_encoding!(copy)
  guessed.encoding
end