| [1329] | 1 | diff --git a/activesupport/lib/active_support/multibyte.rb b/activesupport/lib/active_support/multibyte.rb | 
|---|
 | 2 | index 65a96af..b6354ee 100644 | 
|---|
 | 3 | --- a/activesupport/lib/active_support/multibyte.rb | 
|---|
 | 4 | +++ b/activesupport/lib/active_support/multibyte.rb | 
|---|
 | 5 | @@ -1,9 +1,5 @@ | 
|---|
 | 6 |  # encoding: utf-8 | 
|---|
 | 7 |   | 
|---|
 | 8 | -require 'active_support/multibyte/chars' | 
|---|
 | 9 | -require 'active_support/multibyte/exceptions' | 
|---|
 | 10 | -require 'active_support/multibyte/unicode_database' | 
|---|
 | 11 | - | 
|---|
 | 12 |  module ActiveSupport #:nodoc: | 
|---|
 | 13 |    module Multibyte | 
|---|
 | 14 |      # A list of all available normalization forms. See http://www.unicode.org/reports/tr15/tr15-29.html for more | 
|---|
 | 15 | @@ -27,7 +23,35 @@ module ActiveSupport #:nodoc: | 
|---|
 | 16 |      # | 
|---|
 | 17 |      # Example: | 
|---|
 | 18 |      #   ActiveSupport::Multibyte.proxy_class = CharsForUTF32 | 
|---|
 | 19 | -    mattr_accessor :proxy_class | 
|---|
 | 20 | -    self.proxy_class = ActiveSupport::Multibyte::Chars | 
|---|
 | 21 | +    def self.proxy_class=(klass) | 
|---|
 | 22 | +      @proxy_class = klass | 
|---|
 | 23 | +    end | 
|---|
 | 24 | + | 
|---|
 | 25 | +    # Returns the current proxy class | 
|---|
 | 26 | +    def self.proxy_class | 
|---|
 | 27 | +      @proxy_class ||= ActiveSupport::Multibyte::Chars | 
|---|
 | 28 | +    end | 
|---|
 | 29 | + | 
|---|
 | 30 | +    # Regular expressions that describe valid byte sequences for a character | 
|---|
 | 31 | +    VALID_CHARACTER = { | 
|---|
 | 32 | +      # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site) | 
|---|
 | 33 | +      'UTF-8' => /\A(?: | 
|---|
 | 34 | +                  [\x00-\x7f]                                         | | 
|---|
 | 35 | +                  [\xc2-\xdf] [\x80-\xbf]                             | | 
|---|
 | 36 | +                  \xe0        [\xa0-\xbf] [\x80-\xbf]                 | | 
|---|
 | 37 | +                  [\xe1-\xef] [\x80-\xbf] [\x80-\xbf]                 | | 
|---|
 | 38 | +                  \xf0        [\x90-\xbf] [\x80-\xbf] [\x80-\xbf]     | | 
|---|
 | 39 | +                  [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf]     | | 
|---|
 | 40 | +                  \xf4        [\x80-\x8f] [\x80-\xbf] [\x80-\xbf])\z /xn, | 
|---|
 | 41 | +      # Quick check for valid Shift-JIS characters, disregards the odd-even pairing | 
|---|
 | 42 | +      'Shift_JIS' => /\A(?: | 
|---|
 | 43 | +                  [\x00-\x7e \xa1-\xdf]                                     | | 
|---|
 | 44 | +                  [\x81-\x9f \xe0-\xef] [\x40-\x7e \x80-\x9e \x9f-\xfc])\z /xn | 
|---|
 | 45 | +    } | 
|---|
 | 46 |    end | 
|---|
 | 47 |  end | 
|---|
 | 48 | + | 
|---|
 | 49 | +require 'active_support/multibyte/chars' | 
|---|
 | 50 | +require 'active_support/multibyte/exceptions' | 
|---|
 | 51 | +require 'active_support/multibyte/unicode_database' | 
|---|
 | 52 | +require 'active_support/multibyte/utils' | 
|---|
 | 53 | diff --git a/activesupport/lib/active_support/multibyte/chars.rb b/activesupport/lib/active_support/multibyte/chars.rb | 
|---|
 | 54 | index 3d392d2..16bc130 100644 | 
|---|
 | 55 | --- a/activesupport/lib/active_support/multibyte/chars.rb | 
|---|
 | 56 | +++ b/activesupport/lib/active_support/multibyte/chars.rb | 
|---|
 | 57 | @@ -73,16 +73,7 @@ module ActiveSupport #:nodoc: | 
|---|
 | 58 |        UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/ | 
|---|
 | 59 |        UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/ | 
|---|
 | 60 |   | 
|---|
 | 61 | -      # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site) | 
|---|
 | 62 | -      UTF8_PAT = /\A(?: | 
|---|
 | 63 | -                     [\x00-\x7f]                                     | | 
|---|
 | 64 | -                     [\xc2-\xdf] [\x80-\xbf]                         | | 
|---|
 | 65 | -                     \xe0        [\xa0-\xbf] [\x80-\xbf]             | | 
|---|
 | 66 | -                     [\xe1-\xef] [\x80-\xbf] [\x80-\xbf]             | | 
|---|
 | 67 | -                     \xf0        [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] | | 
|---|
 | 68 | -                     [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] | | 
|---|
 | 69 | -                     \xf4        [\x80-\x8f] [\x80-\xbf] [\x80-\xbf] | 
|---|
 | 70 | -                    )*\z/xn | 
|---|
 | 71 | +      UTF8_PAT = ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'] | 
|---|
 | 72 |   | 
|---|
 | 73 |        attr_reader :wrapped_string | 
|---|
 | 74 |        alias to_s wrapped_string | 
|---|
 | 75 | @@ -307,23 +298,23 @@ module ActiveSupport #:nodoc: | 
|---|
 | 76 |        def rstrip | 
|---|
 | 77 |          chars(@wrapped_string.gsub(UNICODE_TRAILERS_PAT, '')) | 
|---|
 | 78 |        end | 
|---|
 | 79 | -       | 
|---|
 | 80 | + | 
|---|
 | 81 |        # Strips entire range of Unicode whitespace from the left of the string. | 
|---|
 | 82 |        def lstrip | 
|---|
 | 83 |          chars(@wrapped_string.gsub(UNICODE_LEADERS_PAT, '')) | 
|---|
 | 84 |        end | 
|---|
 | 85 | -       | 
|---|
 | 86 | + | 
|---|
 | 87 |        # Strips entire range of Unicode whitespace from the right and left of the string. | 
|---|
 | 88 |        def strip | 
|---|
 | 89 |          rstrip.lstrip | 
|---|
 | 90 |        end | 
|---|
 | 91 | -       | 
|---|
 | 92 | + | 
|---|
 | 93 |        # Returns the number of codepoints in the string | 
|---|
 | 94 |        def size | 
|---|
 | 95 |          self.class.u_unpack(@wrapped_string).size | 
|---|
 | 96 |        end | 
|---|
 | 97 |        alias_method :length, :size | 
|---|
 | 98 | -       | 
|---|
 | 99 | + | 
|---|
 | 100 |        # Reverses all characters in the string. | 
|---|
 | 101 |        # | 
|---|
 | 102 |        # Example: | 
|---|
 | 103 | @@ -331,7 +322,7 @@ module ActiveSupport #:nodoc: | 
|---|
 | 104 |        def reverse | 
|---|
 | 105 |          chars(self.class.u_unpack(@wrapped_string).reverse.pack('U*')) | 
|---|
 | 106 |        end | 
|---|
 | 107 | -       | 
|---|
 | 108 | + | 
|---|
 | 109 |        # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that | 
|---|
 | 110 |        # character. | 
|---|
 | 111 |        # | 
|---|
 | 112 | @@ -646,7 +637,7 @@ module ActiveSupport #:nodoc: | 
|---|
 | 113 |            string.split(//u).map do |c| | 
|---|
 | 114 |              c.force_encoding(Encoding::ASCII) if c.respond_to?(:force_encoding) | 
|---|
 | 115 |   | 
|---|
 | 116 | -            if !UTF8_PAT.match(c) | 
|---|
 | 117 | +            if !ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'].match(c) | 
|---|
 | 118 |                n = c.unpack('C')[0] | 
|---|
 | 119 |                n < 128 ? n.chr : | 
|---|
 | 120 |                n < 160 ? [UCD.cp1252[n] || n].pack('U') : | 
|---|
 | 121 | diff --git a/activesupport/lib/active_support/multibyte/utils.rb b/activesupport/lib/active_support/multibyte/utils.rb | 
|---|
 | 122 | new file mode 100644 | 
|---|
 | 123 | index 0000000..acef84d | 
|---|
 | 124 | --- /dev/null | 
|---|
 | 125 | +++ b/activesupport/lib/active_support/multibyte/utils.rb | 
|---|
 | 126 | @@ -0,0 +1,61 @@ | 
|---|
 | 127 | +# encoding: utf-8 | 
|---|
 | 128 | + | 
|---|
 | 129 | +module ActiveSupport #:nodoc: | 
|---|
 | 130 | +  module Multibyte #:nodoc: | 
|---|
 | 131 | +    if Kernel.const_defined?(:Encoding) | 
|---|
 | 132 | +      # Returns a regular expression that matches valid characters in the current encoding | 
|---|
 | 133 | +      def self.valid_character | 
|---|
 | 134 | +        VALID_CHARACTER[Encoding.default_internal.to_s] | 
|---|
 | 135 | +      end | 
|---|
 | 136 | +    else | 
|---|
 | 137 | +      def self.valid_character | 
|---|
 | 138 | +        case $KCODE | 
|---|
 | 139 | +        when 'UTF8' | 
|---|
 | 140 | +          VALID_CHARACTER['UTF-8'] | 
|---|
 | 141 | +        when 'SJIS' | 
|---|
 | 142 | +          VALID_CHARACTER['Shift_JIS'] | 
|---|
 | 143 | +        end | 
|---|
 | 144 | +      end | 
|---|
 | 145 | +    end | 
|---|
 | 146 | + | 
|---|
 | 147 | +    if 'string'.respond_to?(:valid_encoding?) | 
|---|
 | 148 | +      # Verifies the encoding of a string | 
|---|
 | 149 | +      def self.verify(string) | 
|---|
 | 150 | +        string.valid_encoding? | 
|---|
 | 151 | +      end | 
|---|
 | 152 | +    else | 
|---|
 | 153 | +      def self.verify(string) | 
|---|
 | 154 | +        if expression = valid_character | 
|---|
 | 155 | +          for c in string.split(//) | 
|---|
 | 156 | +            return false unless valid_character.match(c) | 
|---|
 | 157 | +          end | 
|---|
 | 158 | +        end | 
|---|
 | 159 | +        true | 
|---|
 | 160 | +      end | 
|---|
 | 161 | +    end | 
|---|
 | 162 | + | 
|---|
 | 163 | +    # Verifies the encoding of the string and raises an exception when it's not valid | 
|---|
 | 164 | +    def self.verify!(string) | 
|---|
 | 165 | +      raise EncodingError.new("Found characters with invalid encoding") unless verify(string) | 
|---|
 | 166 | +    end | 
|---|
 | 167 | + | 
|---|
 | 168 | +    if 'string'.respond_to?(:force_encoding) | 
|---|
 | 169 | +      # Removes all invalid characters from the string. | 
|---|
 | 170 | +      # | 
|---|
 | 171 | +      # Note: this method is a no-op in Ruby 1.9 | 
|---|
 | 172 | +      def self.clean(string) | 
|---|
 | 173 | +        string | 
|---|
 | 174 | +      end | 
|---|
 | 175 | +    else | 
|---|
 | 176 | +      def self.clean(string) | 
|---|
 | 177 | +        if expression = valid_character | 
|---|
 | 178 | +          stripped = []; for c in string.split(//) | 
|---|
 | 179 | +            stripped << c if valid_character.match(c) | 
|---|
 | 180 | +          end; stripped.join | 
|---|
 | 181 | +        else | 
|---|
 | 182 | +          string | 
|---|
 | 183 | +        end | 
|---|
 | 184 | +      end | 
|---|
 | 185 | +    end | 
|---|
 | 186 | +  end | 
|---|
 | 187 | +end | 
|---|
 | 188 | \ No newline at end of file | 
|---|
 | 189 | diff --git a/activesupport/test/multibyte_utils_test.rb b/activesupport/test/multibyte_utils_test.rb | 
|---|
 | 190 | new file mode 100644 | 
|---|
 | 191 | index 0000000..d8ac5ff | 
|---|
 | 192 | --- /dev/null | 
|---|
 | 193 | +++ b/activesupport/test/multibyte_utils_test.rb | 
|---|
 | 194 | @@ -0,0 +1,141 @@ | 
|---|
 | 195 | +# encoding: utf-8 | 
|---|
 | 196 | + | 
|---|
 | 197 | +require 'abstract_unit' | 
|---|
 | 198 | +require 'multibyte_test_helpers' | 
|---|
 | 199 | + | 
|---|
 | 200 | +class MultibyteUtilsTest < ActiveSupport::TestCase | 
|---|
 | 201 | +  include MultibyteTestHelpers | 
|---|
 | 202 | + | 
|---|
 | 203 | +  test "valid_character returns an expression for the current encoding" do | 
|---|
 | 204 | +    with_encoding('None') do | 
|---|
 | 205 | +      assert_nil ActiveSupport::Multibyte.valid_character | 
|---|
 | 206 | +    end | 
|---|
 | 207 | +    with_encoding('UTF8') do | 
|---|
 | 208 | +      assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['UTF-8'], ActiveSupport::Multibyte.valid_character | 
|---|
 | 209 | +    end | 
|---|
 | 210 | +    with_encoding('SJIS') do | 
|---|
 | 211 | +      assert_equal ActiveSupport::Multibyte::VALID_CHARACTER['Shift_JIS'], ActiveSupport::Multibyte.valid_character | 
|---|
 | 212 | +    end | 
|---|
 | 213 | +  end | 
|---|
 | 214 | + | 
|---|
 | 215 | +  test "verify verifies ASCII strings are properly encoded" do | 
|---|
 | 216 | +    with_encoding('None') do | 
|---|
 | 217 | +      examples.each do |example| | 
|---|
 | 218 | +        assert ActiveSupport::Multibyte.verify(example) | 
|---|
 | 219 | +      end | 
|---|
 | 220 | +    end | 
|---|
 | 221 | +  end | 
|---|
 | 222 | + | 
|---|
 | 223 | +  test "verify verifies UTF-8 strings are properly encoded" do | 
|---|
 | 224 | +    with_encoding('UTF8') do | 
|---|
 | 225 | +      assert ActiveSupport::Multibyte.verify(example('valid UTF-8')) | 
|---|
 | 226 | +      assert !ActiveSupport::Multibyte.verify(example('invalid UTF-8')) | 
|---|
 | 227 | +    end | 
|---|
 | 228 | +  end | 
|---|
 | 229 | + | 
|---|
 | 230 | +  test "verify verifies Shift-JIS strings are properly encoded" do | 
|---|
 | 231 | +    with_encoding('SJIS') do | 
|---|
 | 232 | +      assert ActiveSupport::Multibyte.verify(example('valid Shift-JIS')) | 
|---|
 | 233 | +      assert !ActiveSupport::Multibyte.verify(example('invalid Shift-JIS')) | 
|---|
 | 234 | +    end | 
|---|
 | 235 | +  end | 
|---|
 | 236 | + | 
|---|
 | 237 | +  test "verify! raises an exception when it finds an invalid character" do | 
|---|
 | 238 | +    with_encoding('UTF8') do | 
|---|
 | 239 | +      assert_raises(ActiveSupport::Multibyte::EncodingError) do | 
|---|
 | 240 | +        ActiveSupport::Multibyte.verify!(example('invalid UTF-8')) | 
|---|
 | 241 | +      end | 
|---|
 | 242 | +    end | 
|---|
 | 243 | +  end | 
|---|
 | 244 | + | 
|---|
 | 245 | +  test "verify! doesn't raise an exception when the encoding is valid" do | 
|---|
 | 246 | +    with_encoding('UTF8') do | 
|---|
 | 247 | +      assert_nothing_raised do | 
|---|
 | 248 | +        ActiveSupport::Multibyte.verify!(example('valid UTF-8')) | 
|---|
 | 249 | +      end | 
|---|
 | 250 | +    end | 
|---|
 | 251 | +  end | 
|---|
 | 252 | + | 
|---|
 | 253 | +  if RUBY_VERSION < '1.9' | 
|---|
 | 254 | +    test "clean leaves ASCII strings intact" do | 
|---|
 | 255 | +      with_encoding('None') do | 
|---|
 | 256 | +        [ | 
|---|
 | 257 | +          'word', "\270\236\010\210\245" | 
|---|
 | 258 | +        ].each do |string| | 
|---|
 | 259 | +          assert_equal string, ActiveSupport::Multibyte.clean(string) | 
|---|
 | 260 | +        end | 
|---|
 | 261 | +      end | 
|---|
 | 262 | +    end | 
|---|
 | 263 | + | 
|---|
 | 264 | +    test "clean cleans invalid characters from UTF-8 encoded strings" do | 
|---|
 | 265 | +      with_encoding('UTF8') do | 
|---|
 | 266 | +        cleaned_utf8 = [8].pack('C*') | 
|---|
 | 267 | +        assert_equal example('valid UTF-8'), ActiveSupport::Multibyte.clean(example('valid UTF-8')) | 
|---|
 | 268 | +        assert_equal cleaned_utf8, ActiveSupport::Multibyte.clean(example('invalid UTF-8')) | 
|---|
 | 269 | +      end | 
|---|
 | 270 | +    end | 
|---|
 | 271 | + | 
|---|
 | 272 | +    test "clean cleans invalid characters from Shift-JIS encoded strings" do | 
|---|
 | 273 | +      with_encoding('SJIS') do | 
|---|
 | 274 | +        cleaned_sjis = [184, 0, 136, 165].pack('C*') | 
|---|
 | 275 | +        assert_equal example('valid Shift-JIS'), ActiveSupport::Multibyte.clean(example('valid Shift-JIS')) | 
|---|
 | 276 | +        assert_equal cleaned_sjis, ActiveSupport::Multibyte.clean(example('invalid Shift-JIS')) | 
|---|
 | 277 | +      end | 
|---|
 | 278 | +    end | 
|---|
 | 279 | +  else | 
|---|
 | 280 | +    test "clean is a no-op" do | 
|---|
 | 281 | +      with_encoding('UTF8') do | 
|---|
 | 282 | +        assert_equal example('invalid Shift-JIS'), ActiveSupport::Multibyte.clean(example('invalid Shift-JIS')) | 
|---|
 | 283 | +      end | 
|---|
 | 284 | +    end | 
|---|
 | 285 | +  end | 
|---|
 | 286 | + | 
|---|
 | 287 | +  private | 
|---|
 | 288 | + | 
|---|
 | 289 | +  STRINGS = { | 
|---|
 | 290 | +    'valid ASCII'       => [65, 83, 67, 73, 73].pack('C*'), | 
|---|
 | 291 | +    'invalid ASCII'     => [128].pack('C*'), | 
|---|
 | 292 | +    'valid UTF-8'       => [227, 129, 147, 227, 129, 171, 227, 129, 161, 227, 130, 143].pack('C*'), | 
|---|
 | 293 | +    'invalid UTF-8'     => [184, 158, 8, 136, 165].pack('C*'), | 
|---|
 | 294 | +    'valid Shift-JIS'   => [131, 122, 129, 91, 131, 128].pack('C*'), | 
|---|
 | 295 | +    'invalid Shift-JIS' => [184, 158, 8, 0, 255, 136, 165].pack('C*') | 
|---|
 | 296 | +  } | 
|---|
 | 297 | + | 
|---|
 | 298 | +  if Kernel.const_defined?(:Encoding) | 
|---|
 | 299 | +    def example(key) | 
|---|
 | 300 | +      STRINGS[key].force_encoding(Encoding.default_internal) | 
|---|
 | 301 | +    end | 
|---|
 | 302 | + | 
|---|
 | 303 | +    def examples | 
|---|
 | 304 | +      STRINGS.values.map { |s| s.force_encoding(Encoding.default_internal) } | 
|---|
 | 305 | +    end | 
|---|
 | 306 | +  else | 
|---|
 | 307 | +    def example(key) | 
|---|
 | 308 | +      STRINGS[key] | 
|---|
 | 309 | +    end | 
|---|
 | 310 | + | 
|---|
 | 311 | +    def examples | 
|---|
 | 312 | +      STRINGS.values | 
|---|
 | 313 | +    end | 
|---|
 | 314 | +  end | 
|---|
 | 315 | + | 
|---|
 | 316 | +  if 'string'.respond_to?(:encoding) | 
|---|
 | 317 | +    def with_encoding(enc) | 
|---|
 | 318 | +      before = Encoding.default_internal | 
|---|
 | 319 | + | 
|---|
 | 320 | +      case enc | 
|---|
 | 321 | +      when 'UTF8' | 
|---|
 | 322 | +        Encoding.default_internal = Encoding::UTF_8 | 
|---|
 | 323 | +      when 'SJIS' | 
|---|
 | 324 | +        Encoding.default_internal = Encoding::Shift_JIS | 
|---|
 | 325 | +      else | 
|---|
 | 326 | +        Encoding.default_internal = Encoding::BINARY | 
|---|
 | 327 | +      end | 
|---|
 | 328 | +      yield | 
|---|
 | 329 | + | 
|---|
 | 330 | +      Encoding.default_internal = before | 
|---|
 | 331 | +    end | 
|---|
 | 332 | +  else | 
|---|
 | 333 | +    alias with_encoding with_kcode | 
|---|
 | 334 | +  end | 
|---|
 | 335 | +end | 
|---|
 | 336 | \ No newline at end of file | 
|---|
 | 337 |  | 
|---|