001/* 002 * This file is part of the Jikes RVM project (http://jikesrvm.org). 003 * 004 * This file is licensed to You under the Eclipse Public License (EPL); 005 * You may not use this file except in compliance with the License. You 006 * may obtain a copy of the License at 007 * 008 * http://www.opensource.org/licenses/eclipse-1.0.php 009 * 010 * See the COPYRIGHT.txt file distributed with this work for information 011 * regarding copyright ownership. 012 */ 013package org.jikesrvm.classloader; 014 015import java.io.UTFDataFormatException; 016import java.nio.ByteBuffer; 017import org.vmmagic.pragma.Pure; 018import org.jikesrvm.VM; 019import org.vmmagic.pragma.Inline; 020import org.vmmagic.pragma.NoInline; 021 022/** 023 * Abstract class that contains conversion routines to/from utf8 024 * and/or pseudo-utf8. It does not support utf8 encodings of 025 * more than 3 bytes. 026 * <p> 027 * The difference between utf8 and pseudo-utf8 is the special 028 * treatment of null. In utf8, null is encoded as a single byte 029 * directly, whereas in pseudo-utf8, it is encoded as a two-byte 030 * sequence. See the JVM specification for more information. 031 */ 032public abstract class UTF8Convert { 033 034 /** 035 * Strictly check the format of the utf8/pseudo-utf8 byte array in 036 * fromUTF8. 037 */ 038 static final boolean STRICTLY_CHECK_FORMAT = false; 039 /** 040 * Set fromUTF8 to not throw an exception when given a normal utf8 041 * byte array. 042 */ 043 static final boolean ALLOW_NORMAL_UTF8 = false; 044 /** 045 * Set fromUTF8 to not throw an exception when given a pseudo utf8 046 * byte array. 047 */ 048 static final boolean ALLOW_PSEUDO_UTF8 = true; 049 /** 050 * Set toUTF8 to write in pseudo-utf8 (rather than normal utf8). 051 */ 052 static final boolean WRITE_PSEUDO_UTF8 = true; 053 054 /** 055 * UTF8 character visitor abstraction 056 */ 057 private abstract static class UTF8CharacterVisitor { 058 abstract void visit_char(char c); 059 } 060 061 /** 062 * Visitor that builds up a char[] as characters are decoded 063 */ 064 private static final class ByteArrayStringEncoderVisitor extends UTF8CharacterVisitor { 065 final char[] result; 066 int index; 067 ByteArrayStringEncoderVisitor(int length) { 068 result = new char[length]; 069 index = 0; 070 } 071 @Override 072 void visit_char(char c) { 073 result[index] = c; 074 index++; 075 } 076 @Override 077 public String toString() { 078 if (VM.runningVM) { 079 return java.lang.JikesRVMSupport.newStringWithoutCopy(result, 0, index); 080 } else { 081 return new String(result, 0, index); 082 } 083 } 084 } 085 086 /** 087 * Visitor that builds up a char[] as characters are decoded 088 */ 089 private static final class ByteBufferStringEncoderVisitor extends UTF8CharacterVisitor { 090 final char[] result; 091 int index; 092 ByteBufferStringEncoderVisitor(int length) { 093 result = new char[length]; 094 index = 0; 095 } 096 @Override 097 void visit_char(char c) { 098 result[index] = c; 099 index++; 100 } 101 @Override 102 public String toString() { 103 if (VM.runningVM) { 104 return java.lang.JikesRVMSupport.newStringWithoutCopy(result, 0, index); 105 } else { 106 return new String(result, 0, index); 107 } 108 } 109 } 110 111 /** 112 * Visitor that builds up a String.hashCode form hashCode as characters are decoded 113 */ 114 private static final class StringHashCodeVisitor extends UTF8CharacterVisitor { 115 int result = 0; 116 @Override 117 void visit_char(char c) { 118 result = result * 31 + c; 119 } 120 int getResult() { 121 return result; 122 } 123 } 124 125 /** 126 * Convert the given sequence of (pseudo-)utf8 formatted bytes 127 * into a String.<p> 128 * 129 * The acceptable input formats are controlled by the 130 * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8 131 * flags. 132 * 133 * @param utf8 (pseudo-)utf8 byte array 134 * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8 135 * @return unicode string 136 */ 137 public static String fromUTF8(byte[] utf8) throws UTFDataFormatException { 138 UTF8CharacterVisitor visitor = new ByteArrayStringEncoderVisitor(utf8.length); 139 visitUTF8(utf8, visitor); 140 return visitor.toString(); 141 } 142 143 /** 144 * Convert the given sequence of (pseudo-)utf8 formatted bytes 145 * into a String. 146 * 147 * The acceptable input formats are controlled by the 148 * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8 149 * flags.<p> 150 * 151 * @param utf8 (pseudo-)utf8 byte array 152 * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8 153 * @return unicode string 154 */ 155 public static String fromUTF8(ByteBuffer utf8) throws UTFDataFormatException { 156 UTF8CharacterVisitor visitor = new ByteBufferStringEncoderVisitor(utf8.remaining()); 157 visitUTF8(utf8, visitor); 158 return visitor.toString(); 159 } 160 161 /** 162 * Convert the given sequence of (pseudo-)utf8 formatted bytes 163 * into a String hashCode.<p> 164 * 165 * The acceptable input formats are controlled by the 166 * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8 167 * flags. 168 * 169 * @param utf8 (pseudo-)utf8 byte array 170 * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8 171 * @return hashCode corresponding to if this were a String.hashCode 172 */ 173 public static int computeStringHashCode(byte[] utf8) throws UTFDataFormatException { 174 StringHashCodeVisitor visitor = new StringHashCodeVisitor(); 175 visitUTF8(utf8, visitor); 176 return visitor.getResult(); 177 } 178 179 @NoInline 180 private static void throwDataFormatException(String message, int location) throws UTFDataFormatException { 181 throw new UTFDataFormatException(message + " at location " + location); 182 } 183 184 /** 185 * Visit all bytes of the given utf8 string calling the visitor when a 186 * character is decoded.<p> 187 * 188 * The acceptable input formats are controlled by the 189 * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8 190 * flags. 191 * 192 * @param utf8 (pseudo-)utf8 byte array 193 * @param visitor called when characters are decoded 194 * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8 195 */ 196 @Inline 197 private static void visitUTF8(byte[] utf8, UTF8CharacterVisitor visitor) throws UTFDataFormatException { 198 for (int i = 0, n = utf8.length; i < n;) { 199 byte b = utf8[i++]; 200 if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8) { 201 if (b == 0) { 202 throwDataFormatException("0 byte encountered", i - 1); 203 } 204 } 205 if (b >= 0) { // < 0x80 unsigned 206 // in the range '\001' to '\177' 207 visitor.visit_char((char) b); 208 continue; 209 } 210 try { 211 byte nb = utf8[i++]; 212 if (b < -32) { // < 0xe0 unsigned 213 // '\000' or in the range '\200' to '\u07FF' 214 char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f)); 215 visitor.visit_char(c); 216 if (STRICTLY_CHECK_FORMAT) { 217 if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) { 218 throwDataFormatException("invalid marker bits for double byte char" , i - 2); 219 } 220 if (c < '\200') { 221 if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) { 222 throwDataFormatException("encountered double byte char that should have been single byte", i - 2); 223 } 224 } else if (c > '\u07FF') { 225 throwDataFormatException("encountered double byte char that should have been single byte", i - 2); 226 } 227 } 228 } else { 229 byte nnb = utf8[i++]; 230 // in the range '\u0800' to '\uFFFF' 231 char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f)); 232 visitor.visit_char(c); 233 if (STRICTLY_CHECK_FORMAT) { 234 if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) { 235 throwDataFormatException("invalid marker bits for triple byte char", i - 3); 236 } 237 if (c < '\u0800') { 238 throwDataFormatException("encountered triple byte char that should have been fewer bytes", i - 3); 239 } 240 } 241 } 242 } catch (ArrayIndexOutOfBoundsException e) { 243 throwDataFormatException("unexpected end", i); 244 } 245 } 246 } 247 248 /** 249 * Visit all bytes of the given utf8 string calling the visitor when a 250 * character is decoded.<p> 251 * 252 * The acceptable input formats are controlled by the 253 * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8 254 * flags. 255 * 256 * @param utf8 (pseudo-)utf8 byte array 257 * @param visitor called when characters are decoded 258 * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8 259 */ 260 @Inline 261 private static void visitUTF8(ByteBuffer utf8, UTF8CharacterVisitor visitor) throws UTFDataFormatException { 262 while (utf8.hasRemaining()) { 263 byte b = utf8.get(); 264 if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8) { 265 if (b == 0) { 266 throwDataFormatException("0 byte encountered", utf8.position() - 1); 267 } 268 } 269 if (b >= 0) { // < 0x80 unsigned 270 // in the range '\001' to '\177' 271 visitor.visit_char((char) b); 272 continue; 273 } 274 try { 275 byte nb = utf8.get(); 276 if (b < -32) { // < 0xe0 unsigned 277 // '\000' or in the range '\200' to '\u07FF' 278 char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f)); 279 visitor.visit_char(c); 280 if (STRICTLY_CHECK_FORMAT) { 281 if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) { 282 throwDataFormatException("invalid marker bits for double byte char", utf8.position() - 2); 283 } 284 if (c < '\200') { 285 if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) { 286 throwDataFormatException("encountered double byte char that should have been single byte", utf8.position() - 2); 287 } 288 } else if (c > '\u07FF') { 289 throwDataFormatException("encountered double byte char that should have been single byte", utf8.position() - 2); 290 } 291 } 292 } else { 293 byte nnb = utf8.get(); 294 // in the range '\u0800' to '\uFFFF' 295 char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f)); 296 visitor.visit_char(c); 297 if (STRICTLY_CHECK_FORMAT) { 298 if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) { 299 throwDataFormatException("invalid marker bits for triple byte char", utf8.position() - 3); 300 } 301 if (c < '\u0800') { 302 throwDataFormatException("encountered triple byte char that should have been fewer bytes", utf8.position() - 3); 303 } 304 } 305 } 306 } catch (ArrayIndexOutOfBoundsException e) { 307 throwDataFormatException("unexpected end", utf8.position()); 308 } 309 } 310 } 311 312 /** 313 * Convert the given String into a sequence of (pseudo-)utf8 314 * formatted bytes.<p> 315 * 316 * The output format is controlled by the WRITE_PSEUDO_UTF8 flag. 317 * 318 * @param s String to convert 319 * @return array containing sequence of (pseudo-)utf8 formatted bytes 320 */ 321 public static byte[] toUTF8(String s) { 322 byte[] result = new byte[utfLength(s)]; 323 int result_index = 0; 324 for (int i = 0, n = s.length(); i < n; ++i) { 325 char c = s.charAt(i); 326 // in all shifts below, c is an (unsigned) char, 327 // so either >>> or >> is ok 328 if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) { 329 result[result_index++] = (byte) c; 330 } else if (c > 0x07FF) { 331 result[result_index++] = (byte) (0xe0 | (byte) (c >> 12)); 332 result[result_index++] = (byte) (0x80 | ((c & 0xfc0) >> 6)); 333 result[result_index++] = (byte) (0x80 | (c & 0x3f)); 334 } else { 335 result[result_index++] = (byte) (0xc0 | (byte) (c >> 6)); 336 result[result_index++] = (byte) (0x80 | (c & 0x3f)); 337 } 338 } 339 return result; 340 } 341 342 /** 343 * Convert the given String into a sequence of (pseudo-)utf8 344 * formatted bytes.<p> 345 * 346 * The output format is controlled by the WRITE_PSEUDO_UTF8 flag. 347 * 348 * @param s String to convert 349 * @param b Byte buffer to hold result 350 */ 351 @Inline 352 public static void toUTF8(String s, ByteBuffer b) { 353 int result_index = 0; 354 for (int i = 0, n = s.length(); i < n; ++i) { 355 char c = s.charAt(i); 356 // in all shifts below, c is an (unsigned) char, 357 // so either >>> or >> is ok 358 if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) { 359 b.put((byte) c); 360 } else if (c > 0x07FF) { 361 b.put((byte) (0xe0 | (byte) (c >> 12))); 362 b.put((byte) (0x80 | ((c & 0xfc0) >> 6))); 363 b.put((byte) (0x80 | (c & 0x3f))); 364 } else { 365 b.put((byte) (0xc0 | (byte) (c >> 6))); 366 b.put((byte) (0x80 | (c & 0x3f))); 367 } 368 } 369 } 370 371 @Pure 372 public static int utfLength(String s) { 373 int utflen = 0; 374 for (int i = 0, n = s.length(); i < n; ++i) { 375 int c = s.charAt(i); 376 if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) { 377 ++utflen; 378 } else if (c > 0x07FF) { 379 utflen += 3; 380 } else { 381 utflen += 2; 382 } 383 } 384 return utflen; 385 } 386 387 /** 388 * Check whether the given sequence of bytes is valid (pseudo-)utf8. 389 * 390 * @param bytes byte array to check 391 * @return {@code true} iff the given sequence is valid (pseudo-)utf8. 392 */ 393 public static boolean check(byte[] bytes) { 394 for (int i = 0, n = bytes.length; i < n;) { 395 byte b = bytes[i++]; 396 if (!ALLOW_NORMAL_UTF8) { 397 if (b == 0) return false; 398 } 399 if (b >= 0) { // < 0x80 unsigned 400 // in the range '\001' to '\177' 401 continue; 402 } 403 try { 404 byte nb = bytes[i++]; 405 if (b < -32) { // < 0xe0 unsigned 406 // '\000' or in the range '\200' to '\u07FF' 407 char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f)); 408 if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) { 409 return false; 410 } 411 if (c < '\200') { 412 if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) { 413 return false; 414 } 415 } else if (c > '\u07FF') { 416 return false; 417 } 418 } else { 419 byte nnb = bytes[i++]; 420 // in the range '\u0800' to '\uFFFF' 421 char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f)); 422 if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) { 423 return false; 424 } 425 if (c < '\u0800') { 426 return false; 427 } 428 } 429 } catch (ArrayIndexOutOfBoundsException e) { 430 return false; 431 } 432 } 433 return true; 434 } 435}