Module: Builder::XChar

Defined in:
lib/builder/xchar.rb,
lib/builder/xchar.rb

Overview

:nodoc:

Constant Summary collapse

# Maps each byte value in the 0x80..0x9F range that this table covers to a
# Unicode code point; the per-entry comments name the character and give its
# U+ code point.  Byte values with no entry here (0x81, 0x8D, 0x8F, 0x90,
# 0x9D) are simply absent from the table.
CP1252 =
{     # :nodoc:
  0x80 => 0x20AC,    # U+20AC euro sign
  0x82 => 0x201A,    # U+201A single low-9 quotation mark
  0x83 => 0x0192,    # U+0192 latin small letter f with hook
  0x84 => 0x201E,    # U+201E double low-9 quotation mark
  0x85 => 0x2026,    # U+2026 horizontal ellipsis
  0x86 => 0x2020,    # U+2020 dagger
  0x87 => 0x2021,    # U+2021 double dagger
  0x88 => 0x02C6,    # U+02C6 modifier letter circumflex accent
  0x89 => 0x2030,    # U+2030 per mille sign
  0x8A => 0x0160,    # U+0160 latin capital letter s with caron
  0x8B => 0x2039,    # U+2039 single left-pointing angle quotation mark
  0x8C => 0x0152,    # U+0152 latin capital ligature oe
  0x8E => 0x017D,    # U+017D latin capital letter z with caron
  0x91 => 0x2018,    # U+2018 left single quotation mark
  0x92 => 0x2019,    # U+2019 right single quotation mark
  0x93 => 0x201C,    # U+201C left double quotation mark
  0x94 => 0x201D,    # U+201D right double quotation mark
  0x95 => 0x2022,    # U+2022 bullet
  0x96 => 0x2013,    # U+2013 en dash
  0x97 => 0x2014,    # U+2014 em dash
  0x98 => 0x02DC,    # U+02DC small tilde
  0x99 => 0x2122,    # U+2122 trade mark sign
  0x9A => 0x0161,    # U+0161 latin small letter s with caron
  0x9B => 0x203A,    # U+203A single right-pointing angle quotation mark
  0x9C => 0x0153,    # U+0153 latin small ligature oe
  0x9E => 0x017E,    # U+017E latin small letter z with caron
  0x9F => 0x0178,    # U+0178 latin capital letter y with diaeresis
}
# Maps the code point of each character that must be escaped in XML text to
# the entity reference that replaces it.  The values have to be the entity
# references themselves: mapping a character to itself would make the
# substitution a no-op and leave markup characters unescaped in the output.
PREDEFINED =
{
  38 => '&amp;', # ampersand
  60 => '&lt;',  # left angle bracket
  62 => '&gt;',  # right angle bracket
}
# Code points accepted as XML characters, given either as single Integers or
# as inclusive Ranges.  The list corresponds to the Char production of
# XML 1.0.
VALID =
[
  0x9,                # horizontal tab
  0xA,                # line feed
  0xD,                # carriage return
  0x20..0xD7FF,
  0xE000..0xFFFD,
  0x10000..0x10FFFF
]
# String substituted for characters that cannot appear in XML.  Picks
# U+FFFD when String responds to #encode, the UTF-8 byte sequence for the
# same character when $KCODE is 'UTF8', and a plain asterisk otherwise.
REPLACEMENT_CHAR =
case
when String.method_defined?(:encode) then "\uFFFD"
when $KCODE == 'UTF8'                then "\xEF\xBF\xBD"
else                                      '*'
end
# Character class that matches any single character whose code point is a
# key of PREDEFINED.
XML_PREDEFINED =
Regexp.new(
  "[#{Builder::XChar::PREDEFINED.keys.pack('U*').force_encoding('utf-8')}]"
)
# Negated character class built from VALID: each Integer entry contributes
# that single character and each Range entry contributes a "first-last"
# span, so the pattern matches any character that VALID does not list.
INVALID_XML_CHAR =
Regexp.new(
  [
    '[^',
    *Builder::XChar::VALID.map do |entry|
      case entry
      when Integer
        [entry].pack('U').force_encoding('utf-8')
      when Range
        # 0x2D is the hyphen that separates the two ends of the span.
        [entry.begin, 0x2D, entry.end].pack('U3').force_encoding('utf-8')
      end
    end,
    ']'
  ].join
)
# Encoding objects that XChar.unicode compares against and transcodes with.

# Raw bytes with no character semantics.
ENCODING_BINARY =
Encoding::BINARY
# Target encoding of every conversion.
ENCODING_UTF8 =
Encoding::UTF_8
# Fallback interpretation for bytes that are not valid UTF-8.
ENCODING_ISO1 =
Encoding::ISO_8859_1

Class Method Summary collapse

Class Method Details

.encode(string) ⇒ Object

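Encodes a string per XML rules: the input is converted to UTF-8, CP1252 characters are mapped to their Unicode equivalents, characters that are invalid in XML are replaced with the replacement character, and the predefined characters are substituted using PREDEFINED.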



152
153
154
155
156
157
# File 'lib/builder/xchar.rb', line 152

# Encode +string+ per XML rules.
#
# The steps run in this order:
# 1. normalise the input with XChar.unicode;
# 2. translate the characters listed in CP1252_DIFFERENCES into the
#    corresponding characters of UNICODE_EQUIVALENT;
# 3. replace every match of INVALID_XML_CHAR with REPLACEMENT_CHAR;
# 4. replace every match of XML_PREDEFINED with the PREDEFINED entry for
#    that character's code point.
#
# Returns the encoded String.
def XChar.encode(string)
  utf8     = unicode(string)
  remapped = utf8.tr(CP1252_DIFFERENCES, UNICODE_EQUIVALENT)
  scrubbed = remapped.gsub(INVALID_XML_CHAR, REPLACEMENT_CHAR)
  scrubbed.gsub(XML_PREDEFINED) { |char| PREDEFINED[char.ord] }
end

.unicode(string) ⇒ Object

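Converts a string to valid UTF-8, compensating for a number of common errors: strings tagged as BINARY, and strings whose bytes are not valid UTF-8 (these are reinterpreted as ISO-8859-1).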



126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# File 'lib/builder/xchar.rb', line 126

# Convert +string+ to valid UTF-8, compensating for a number of common errors.
#
# Handling depends on the encoding the string is tagged with:
# * BINARY and ASCII-only: returned unchanged.
# * BINARY with other bytes: a copy is re-tagged as UTF-8; if the bytes are
#   not valid UTF-8 they are transcoded from ISO-8859-1 instead.
# * UTF-8: returned unchanged when valid, otherwise its bytes are transcoded
#   from ISO-8859-1.
# * anything else: transcoded to UTF-8 with String#encode; any error raised
#   by that call propagates to the caller.
#
# The argument itself is never modified.
def XChar.unicode(string)
  if string.encoding == ENCODING_BINARY
    return string if string.ascii_only?

    # Re-tag a copy rather than the caller's object.  +dup+ is used instead
    # of +clone+ because +clone+ keeps the frozen state, which would make
    # force_encoding raise for a frozen input.
    candidate = string.dup.force_encoding(ENCODING_UTF8)
  elsif string.encoding == ENCODING_UTF8
    candidate = string
  else
    return string.encode(ENCODING_UTF8)
  end

  return candidate if candidate.valid_encoding?

  # Not valid UTF-8: reinterpret the raw bytes as ISO-8859-1.
  candidate.encode(ENCODING_UTF8, ENCODING_ISO1)
end