001package com.randomnoun.common; 002 003/* (c) 2013 randomnoun. All Rights Reserved. This work is licensed under a 004 * BSD Simplified License. (http://www.randomnoun.com/bsd-simplified.html) 005 */ 006import java.io.ByteArrayOutputStream; 007import java.io.File; 008import java.io.FileInputStream; 009import java.io.IOException; 010import java.io.Reader; 011import java.io.UnsupportedEncodingException; 012import java.security.MessageDigest; 013import java.security.NoSuchAlgorithmException; 014import java.text.Collator; 015import java.text.ParseException; 016import java.text.SimpleDateFormat; 017import java.util.ArrayList; 018import java.util.BitSet; 019import java.util.Comparator; 020import java.util.Date; 021import java.util.Iterator; 022import java.util.List; 023import java.util.Map; 024import java.util.regex.Matcher; 025import java.util.regex.Pattern; 026 027/** Text utility functions 028 * 029 * @author knoxg 030 */ 031public class Text { 032 033 /** Used to prevent massive debug dumps. See {@link #getDisplayString(String, String)} */ 034 private static final int MAX_STRING_OUTPUT_CHARS = 300; 035 036 /** Left-justification constant for use in the {@link #pad(String, int, int)} method */ 037 public static final int JUSTIFICATION_LEFT = 0; 038 039 /** Center-justification constant for use in the {@link #pad(String, int, int)} method */ 040 public static final int JUSTIFICATION_CENTER = 1; 041 042 /** Right-justification constant for use in the {@link #pad(String, int, int)} method */ 043 public static final int JUSTIFICATION_RIGHT = 2; 044 045 public static Pattern scriptPattern = Pattern.compile("<(/script)", Pattern.CASE_INSENSITIVE); 046 047 /** Returns true if the supplied string is null or the empty string, false otherwise 048 * 049 * @param text The string to test 050 * @return true if the supplied string is null or the empty string, false otherwise 051 */ 052 public static boolean isBlank(String text) { 053 return (text == null || text.equals("")); 054 } 055 056 /** Returns true if the supplied string is non-null and only contains numeric characters 057 * 058 * @param text The string to test 059 * @return true if the supplied string is non-null and only contains numeric characters 060 */ 061 public static boolean isNumeric(String text) { 062 if (text == null) { 063 return false; 064 } 065 char ch; 066 for (int i = 0; i < text.length(); i++) { 067 ch = text.charAt(i); 068 if (ch < '0' || ch > '9') { 069 return false; 070 } 071 } 072 return true; 073 } 074 075 /** Returns true if the supplied string is non-null and only contains numeric characters 076 * or a single decimal point. The value can have a leading negative ('-') symbol. 077 * 078 * @param text The string to test 079 * @return true if the supplied string is non-null and only contains numeric characters, 080 * which may contain a '.' character in there somewhere. 081 */ 082 public static boolean isNumericDecimal(String text) { 083 if (text == null) { 084 return false; 085 } 086 boolean seenPoint = false; // existential quandary there for you 087 char ch; 088 int len = text.length(); 089 for (int i = 0; i < len; i++) { 090 ch = text.charAt(i); 091 if (ch=='.') { 092 if (seenPoint) { return false; } 093 seenPoint = true; 094 } else if (ch == '-' && i == 0) { 095 // leading negative symbol OK 096 if (len == 1) { 097 // but not if it's the only character in the string 098 return false; 099 } 100 } else if (ch < '0' || ch > '9') { 101 return false; 102 } 103 } 104 return true; 105 } 106 107 /** Returns true if the supplied string is non-null and only contains numeric characters 108 * or a single decimal point. The value can have a leading negative ('-') symbol. 109 * 110 * This version allows exponents ("E+nn" or "E-nn") to the end of the value. 111 * 112 * @param text The string to test 113 * @return true if the supplied string is non-null and only contains numeric characters, 114 * which may contain a '.' character in there somewhere. 115 */ 116 public static boolean isNumericDecimalExp(String text) { 117 if (text == null) { 118 return false; 119 } 120 boolean seenPoint = false; // existential quandary there for you 121 int expPos = -1; // position of the 'E' character 122 char ch; 123 for (int i = 0; i < text.length(); i++) { 124 ch = text.charAt(i); 125 if (ch=='E') { 126 if (expPos != -1) { return false; } 127 expPos = i; 128 } else if (ch=='.' && expPos == -1) { 129 if (seenPoint) { return false; } 130 seenPoint = true; 131 } else if ((ch == '+' || ch == '-') && i == expPos + 1) { 132 // + or - directly after 'E' OK 133 } else if (ch == '-' && i == 0) { 134 // leading negative symbol OK 135 } else if (ch < '0' || ch > '9') { 136 return false; 137 } 138 } 139 return true; 140 } 141 142 143 /** Ensures that a string returned from a browser (on any platform) conforms 144 * to unix line-EOF conventions. Any instances of consecutive CRs (<code>0xD</code>) 145 * and LFs (<code>0xA</code>) in a string will be reduced to a series of CRs (the number of CRs will be the 146 * maximum number of CRs or LFs found in a row). 147 * 148 * @param input the input string 149 * 150 * @return the canonicalised string, as described above 151 */ 152 public static String reduceNewlines(String input) { 153 StringBuilder sb = new StringBuilder(); 154 int len = input.length(); 155 int crCount = 0; 156 int lfCount = 0; 157 boolean insertNewline = false; 158 char ch; 159 for (int i=0; i<len; i++) { 160 ch = input.charAt(i); 161 if (ch == (char) 0xA) { 162 lfCount ++; insertNewline = true; 163 } else if (ch == (char) 0xD) { 164 crCount ++; insertNewline = true; 165 } else if (insertNewline) { 166 for (int j=0; j<Math.max(lfCount, crCount); j++) { 167 sb.append((char) 0xA); 168 } 169 insertNewline = false; lfCount=0; crCount=0; 170 sb.append(ch); 171 } else { 172 sb.append(ch); 173 } 174 } 175 if (insertNewline) { 176 for (int j=0; j<Math.max(lfCount, crCount); j++) { 177 sb.append((char) 0xA); 178 } 179 } 180 181 return sb.toString(); 182 } 183 184 185 /** 186 * Returns the HTML-escaped form of a string. The <code>&</code>, 187 * <code><</code>, <code>></code>, and <code>"</code> characters are converted to 188 * <code>&amp;</code>, <code>&lt;</code>, <code>&gt;</code>, and 189 * <code>&quot;</code> respectively. 190 * 191 * <p>Characters in the unicode control code blocks ( apart from \t, \n and \r ) are converted to &xfffd; 192 * <p>Characters outside of the ASCII printable range are converted into &xnnnn; form 193 * 194 * @param string the string to convert 195 * 196 * @return the HTML-escaped form of the string 197 */ 198 static public String escapeHtml(String string) { 199 if (string == null) { 200 return ""; 201 } 202 char c; 203 String hex; 204 StringBuilder sb = new StringBuilder(string.length()); 205 for (int i = 0; i < string.length(); i++) { 206 c = string.charAt(i); 207 // check for illegal characters 208 switch (c) { 209 case '&': 210 sb.append("&"); 211 break; 212 case '<': 213 sb.append("<"); 214 break; 215 case '>': 216 sb.append(">"); 217 break; 218 case '\"': 219 // interestingly, "e; (with the e) works fine for HTML display, 220 // but not inside hidden field values 221 sb.append("""); 222 break; 223 default: 224 // 'illegal characters' according to ESAPI. 7f to 9f are control characters in unicode 225 if ( ( c <= 0x1f && c != '\t' && c != '\n' && c != '\r' ) || ( c >= 0x7f && c <= 0x9f ) ) { 226 sb.append("�"); // REPLACEMENT_HEX in ESAPI's HtmlEntityCodec 227 } else if ( c > 0x1f && c <= 0x7f ) { 228 // safe printable 229 sb.append(c); 230 } else { 231 // ESAPI didn't have the else block above, which was causing it escape everything 232 hex = getHexForNonAlphanumeric(c); 233 sb.append("&#x" + hex + ";"); 234 } 235 236 } 237 } 238 239 return sb.toString(); 240 } 241 242 /** 243 * Returns a regex-escaped form of a string. That is, the pattern 244 * returned by this method, if compiled into a regex, will match 245 * the supplied string exactly. 246 * 247 * @param string the string to convert 248 * 249 * @return the HTML-escaped form of the string 250 */ 251 static public String escapeRegex(String string) { 252 if (string == null) { 253 return ""; 254 } 255 256 char c; 257 StringBuilder sb = new StringBuilder(string.length()); 258 259 for (int i = 0; i < string.length(); i++) { 260 c = string.charAt(i); 261 262 switch (c) { 263 case '.': 264 case '+': // intentional fall-through 265 case '?': // intentional fall-through 266 case '\\': // intentional fall-through 267 case '{': // intentional fall-through 268 case '}': // intentional fall-through 269 case '[': // intentional fall-through 270 case ']': // intentional fall-through 271 case '^': // intentional fall-through 272 case '$': // intentional fall-through 273 case '(': // intentional fall-through 274 case '|': // intentional fall-through 275 case ')': // intentional fall-through 276 sb.append("\\"); 277 sb.append(c); 278 break; 279 default: 280 sb.append(c); 281 } 282 } 283 284 return sb.toString(); 285 } 286 287 288 /** 289 * Returns the csv-escaped form of a string. A csv-escaped string is 290 * used when writing to a CSV (comma-separated-value) file. It ensures 291 * that commas included within a string are quoted. We use the Microsoft-Excel 292 * quoting rules, so that our CSV files can be imported into that. These rules 293 * (derived from experimentation) are: 294 * 295 * <ul> 296 * <li>Strings without commas (,) inverted commas ("), or newlines (\n) are returned as-is. 297 * <li>Otherwise, the string is surrounded by inverted commas, and any 298 * inverted commas within the string are doubled-up (i.e. '"' becomes '""'). 299 * <li>A value that starts with any of "=", "@", "+" or "-" has a leading single apostrophe added 300 * to prevent the value being evaluated in Excel. The leading quote is visible to the user when the 301 * csv is opened, which may mean that it will have to be removed when roundtripping data. 302 * This may complicate things if the user actually wants a leading single quote in their CSV value. 303 * </ul> 304 * 305 * <p>Embedded newlines are inserted as-is, as per Excel. This will require 306 * some care whilst parsing if we want to be able to read these files. 307 * 308 * @param string the string to convert 309 * 310 * @return the csv-escaped form of the string 311 */ 312 static public String escapeCsv(String string) { 313 if (string == null) { 314 return ""; 315 } 316 317 boolean quoted = false; 318 // from https://www.contextis.com/en/blog/comma-separated-vulnerabilities 319 // prefix cells that start with ‘=’ , '@', '+' or '-' with an apostrophe 320 // This will ensure that the cell isn’t interpreted as a formula, and as a bonus in Microsoft Excel the apostrophe itself will not be displayed. 321 if (string.startsWith("=") || 322 string.startsWith("@")) { 323 // prefix the string with an a single quote char to escape it 324 string = "'" + string; 325 quoted = true; // not sure need to quote here, but doesn't hurt 326 } else if ((string.startsWith("+") || string.startsWith("-")) && 327 (string.length() == 1 || !Text.isNumericDecimalExp(string))) { 328 // numbers can legitimately start with '+' or '-' but anything else should be escaped 329 string = "'" + string; 330 quoted = true; 331 } 332 333 334 if (string.indexOf(',') == -1 && string.indexOf('"') == -1 && string.indexOf('\n') == -1 && !quoted) { 335 return string; 336 } 337 string = Text.replaceString(string, "\"", "\"\""); 338 string = "\"" + string + "\""; 339 340 return string; 341 } 342 343 /** Given a csv-encoded string (as produced by the rules in {@link #escapeCsv(String)}, 344 * produces a List of Strings which represent the individual values in the string. 345 * Note that this method is *not* equivalent to calling <code>Arrays.asList(astring.split(","))</code>. 346 * 347 * <p>Setting the whitespaceSensitive parameter to false allows leading and trailing 348 * whitespace in *non-quoted* values to be removed, e.g. if the input string <code>text</code> is: 349 * 350 * <pre class="code"> 351 * abc,def, ghi, j k ,"lmn"," op "," q,r","""hello""", "another" 352 * </pre> 353 * 354 * then <code>parseCsv(text, <b>false</b>)</code> will return the strings: 355 * <pre class="code"> 356 * abc 357 * def 358 * ghi 359 * j k 360 * lmn 361 * op <i>(this String has one leading space, and a trailing space after 'p')</i> 362 * q,r <i>(this String has one leading space)</i> 363 * "hello" 364 * another 365 * </pre> 366 * 367 * and <code>parseCsv(text, <b>true</b>)</code> would throw a ParseException (since the 368 * final element is a quoted value, but begins with a space). 369 * 370 * If the <code>, "another"</code> text is removed, however, then 371 * <code>parseCsv(text, true)</code> would return the following: 372 * 373 * and <code>parseCsv(text, true)</code> will return the string 374 * <pre> 375 * abc 376 * def 377 * ghi <i>(this String has two leading spaces)</i> 378 * j k <i>(this String has one leading space and a trailing space after the 'k' character)</i> 379 * lmn 380 * op <i>(this String has one leading space, and a trailing space after 'p')</i> 381 * q,r <i>(this String has one leading space)</i> 382 * "hello" 383 * </pre> 384 * 385 * <p>Most applications would want to use the 'whiteSpaceSensitive=false' form of this function, since 386 * (a) less chance of a ParseException, and (b) it's what an end-user would normally 387 * expect. This can be performed by calling the {@link #parseCsv(String)} method. 388 * 389 * <p>Whitespace is determined by using the <code>Character.isSpaceChar()</code> method, 390 * which is Unicode-aware. 391 * 392 * @param text The CSV-encoded string to parse 393 * @param whitespaceSensitive If set to true, will trim leading and trailing whitespace in *non-quoted* values. 394 * 395 * @return a List of Strings. The returned List is guaranteed to always contain at least one element. 396 * 397 * @throws NullPointerException if the text passed to this method is null 398 * @throws ParseException if a quoted value contains leading whitespace before the 399 * opening quote, or after the trailing quote. 400 * @throws ParseException if a quoted value has a start quote, but no end quote, or 401 * if a value has additional text after a quoted value (before the next comma or EOL). 402 */ 403 static public List<String> parseCsv(String text, boolean whitespaceSensitive) 404 throws ParseException { 405 if (text == null) { 406 throw new NullPointerException("null text"); 407 } 408 409 // parse state: 410 // 0=searching for new value (at start of line or after comma) 411 // 1=consuming non-quoted values 412 // 2=consuming quoted value 413 // 3=consumed first quote within a quoted value (may be termining quote or a "" sequence) 414 // 4=consuming whitespace up to next comma/EOL (after quoted value, not whitespaceSensitive) 415 int parseState = 0; 416 int length = text.length(); 417 String element; 418 List<String> result = new ArrayList<String>(); 419 char ch; 420 StringBuilder buffer = new StringBuilder(); 421 422 for (int pos = 0; pos < length; pos++) { 423 ch = text.charAt(pos); 424 425 // System.out.println("pos " + pos + ", state=" + parseState + ", nextchar=" + ch + ", buf=" + buffer); 426 switch (parseState) { 427 case 0: 428 if (Character.isSpaceChar(ch)) { 429 if (whitespaceSensitive) { 430 buffer.append(ch); 431 parseState = 1; 432 } else { 433 // ignore 434 } 435 } else if (ch == '"') { 436 parseState = 2; 437 } else if (ch == ',') { 438 result.add(""); // add an empty element; state remains unchanged 439 } else { 440 buffer.append(ch); 441 parseState = 1; 442 } 443 break; 444 case 1: 445 if (ch == ',') { 446 element = buffer.toString(); 447 if (!whitespaceSensitive) { 448 element = element.trim(); 449 } 450 result.add(element); 451 buffer.setLength(0); 452 parseState = 0; 453 } else { 454 buffer.append(ch); 455 } 456 break; 457 case 2: 458 if (ch == '"') { 459 parseState = 3; 460 } else { 461 buffer.append(ch); 462 } 463 break; 464 case 3: 465 if (ch == '"') { 466 buffer.append('"'); 467 parseState = 2; 468 } else if (ch == ',') { 469 result.add(buffer.toString()); 470 buffer.setLength(0); 471 parseState = 0; 472 } else if (Character.isSpaceChar(ch)) { 473 if (whitespaceSensitive) { 474 throw new ParseException("Cannot have trailing whitespace after close quote character", pos); 475 } 476 parseState = 4; 477 } else { 478 throw new ParseException("Cannot have trailing data after close quote character", pos); 479 } 480 break; 481 case 4: 482 if (Character.isSpaceChar(ch)) { 483 // consume and ignore 484 } else if (ch == ',') { 485 result.add(buffer.toString()); 486 buffer.setLength(0); 487 parseState = 0; 488 } else { 489 throw new ParseException("Cannot have trailing data after close quote character", pos); 490 } 491 break; 492 493 default: 494 throw new IllegalStateException("Illegal state '" + parseState + "' in parseCsv"); 495 } 496 } 497 498 // if state is 2, we are in the middle of a quoted value 499 if (parseState == 2) { 500 throw new ParseException("Missing endquote in csv text", length); 501 } 502 503 // otherwise we still need to add what's left in the buffer into the result list 504 element = buffer.toString(); 505 if (parseState == 1 && !whitespaceSensitive) { 506 element = element.trim(); 507 } 508 result.add(element); 509 return result; 510 } 511 512 @FunctionalInterface 513 public interface CsvLineReader { // doesn't extend Supplier<T> as it throws exceptions 514 /** Returns the next logical line in the CSV ( quoted values can contain newlines ) 515 * 516 * @return 517 * @throws ParseException 518 * @throws IOException 519 */ 520 List<String> readLine() throws ParseException, IOException; 521 } 522 523 // same as parseCsv(String, whitespaceSensitive) but can handle newlines in quotes by supplying a Reader 524 // the returned object will return a List<String> or null if EOF is reached 525 // ParseExceptions are wrapped in something, probably 526 static public CsvLineReader parseCsv(Reader r, boolean whitespaceSensitive) { 527 if (r == null) { 528 throw new NullPointerException("null reader"); 529 } 530 return new CsvLineReader() { 531 // eof if we actually read eof or encouner a parse exception ( cannot recover ) 532 boolean isAtStart = true; // for backwards compatibility with Text.parseCsv(""), first readLine() is never null 533 boolean isEOF = false; 534 @Override 535 public List<String> readLine() throws ParseException, IOException { 536 if (isEOF) { return null; } 537 538 // parse state: 539 // 0=searching for new value (at start of line or after comma) 540 // 1=consuming non-quoted values 541 // 2=consuming quoted value 542 // 3=consumed first quote within a quoted value (may be termining quote or a "" sequence) 543 // 4=consuming whitespace up to next comma/EOL (after quoted value, not whitespaceSensitive) 544 int parseState = 0; 545 // int length = text.length(); 546 String element; 547 List<String> result = new ArrayList<String>(); 548 char ch; 549 StringBuilder buffer = new StringBuilder(); 550 int intChar = r.read(); 551 int pos = 1; 552 if (intChar == -1 && !isAtStart) { 553 isEOF = true; 554 return null; 555 } 556 557 // @TODO better CRLF handling 558 isAtStart = false; 559 while (intChar != -1) { 560 ch = (char) intChar; 561 562 // System.out.println("pos " + pos + ", state=" + parseState + ", nextchar=" + ch + ", buf=" + buffer); 563 switch (parseState) { 564 case 0: 565 if (ch == '\n') { 566 // return result so far 567 element = buffer.toString(); 568 result.add(buffer.toString()); 569 return result; 570 } else if (Character.isSpaceChar(ch)) { 571 if (whitespaceSensitive) { 572 buffer.append(ch); 573 parseState = 1; 574 } else { 575 // ignore 576 } 577 } else if (ch == '"') { 578 parseState = 2; 579 } else if (ch == ',') { 580 result.add(""); // add an empty element; state remains unchanged 581 } else { 582 buffer.append(ch); 583 parseState = 1; 584 } 585 break; 586 case 1: 587 if (ch == '\n') { 588 // return result so far 589 element = buffer.toString(); 590 if (!whitespaceSensitive) { 591 element = element.trim(); 592 } 593 result.add(buffer.toString()); 594 return result; 595 } else if (ch == ',') { 596 element = buffer.toString(); 597 if (!whitespaceSensitive) { 598 element = element.trim(); 599 } 600 result.add(element); 601 buffer.setLength(0); 602 parseState = 0; 603 } else { 604 buffer.append(ch); 605 } 606 break; 607 case 2: 608 if (ch == '"') { 609 parseState = 3; 610 } else { 611 buffer.append(ch); 612 } 613 break; 614 case 3: 615 if (ch == '\n') { 616 result.add(buffer.toString()); 617 buffer.setLength(0); 618 parseState = 0; 619 return result; 620 } else if (ch == '"') { 621 buffer.append('"'); 622 parseState = 2; 623 } else if (ch == ',') { 624 result.add(buffer.toString()); 625 buffer.setLength(0); 626 parseState = 0; 627 } else if (Character.isSpaceChar(ch)) { 628 if (whitespaceSensitive) { 629 isEOF = true; 630 throw new ParseException("Cannot have trailing whitespace after close quote character", pos); 631 } 632 parseState = 4; 633 } else { 634 isEOF = true; 635 throw new ParseException("Cannot have trailing data after close quote character", pos); 636 } 637 break; 638 case 4: 639 if (ch == '\n') { 640 // return result so far 641 result.add(buffer.toString()); 642 return result; 643 } else if (Character.isSpaceChar(ch)) { 644 // consume and ignore 645 } else if (ch == ',') { 646 result.add(buffer.toString()); 647 buffer.setLength(0); 648 parseState = 0; 649 } else { 650 isEOF = true; 651 throw new ParseException("Cannot have trailing data after close quote character", pos); 652 } 653 break; 654 655 default: 656 throw new IllegalStateException("Illegal state '" + parseState + "' in parseCsv"); 657 } 658 659 intChar = r.read(); 660 pos++; 661 } 662 isEOF = true; 663 664 // if state is 2, we are in the middle of a quoted value 665 if (parseState == 2) { 666 throw new ParseException("Missing endquote in csv text", pos); 667 } 668 669 // otherwise we still need to add what's left in the buffer into the result list 670 element = buffer.toString(); 671 if (parseState == 1 && !whitespaceSensitive) { 672 element = element.trim(); 673 } 674 result.add(element); 675 return result; 676 } 677 }; 678 } 679 680 /** 681 * Equivalent to <code>parseCsv(text, false);</code> (i.e. whitespace-insensitive parsing). 682 * Refer to the documentation for that method for more details. 683 * 684 * @see #parseCsv(String, boolean) 685 * 686 * @param text he CSV-encoded string to parse 687 * 688 * @return a List of Strings. The returned List is guaranteed to always contain at least one element. 689 * 690 * @throws NullPointerException if the text passed to this method is null. 691 * @throws ParseException see {@link #parseCsv(String, boolean)} for details. 692 */ 693 static public List<String> parseCsv(String text) 694 throws ParseException { 695 return Text.parseCsv(text, false); 696 } 697 698 /** Returns a java-escaped string. Replaces '"' with '\"'. 699 * 700 * <p>Since this is predominantly used in the query builder, I am not worrying about 701 * unicode sequences (SWIFT is ASCII) or newlines (although this may be necessary later) 702 * for multiline textboxes 703 * 704 * @return The java-escaped version of the string 705 */ 706 public static String escapeJava(String string) { 707 return Text.replaceString(string, "\"", "\\\""); 708 } 709 710 /** Returns a javascript string. The characters <code>'</code>, 711 * <code>"</code> and <code>\</code> are converted into their Unicode equivalents, 712 * 713 * <p>Non-printable characters are converted into unicode equivalents 714 ** 715 * <p>Newlines are now replaced with "\n" 716 * 717 * @return The java-escaped version of the string 718 */ 719 public static String escapeJavascript(String string) { 720 // backslashes are always escaped 721 //string = Text.replaceString(string, "\\", "\\u005C"); 722 //string = Text.replaceString(string, "\"", "\\u0022"); 723 //string = Text.replaceString(string, "'", "\\u0027"); 724 //string = Text.replaceString(string, "\n", "\\n"); 725 StringBuilder sb = new StringBuilder(string.length()); 726 for (int i = 0; i<string.length(); i++) { 727 char ch = string.charAt(i); 728 if (ch=='\n') { 729 sb.append("\\n"); 730 } else if (ch=='\\' || ch=='"' || ch=='\'' || ch<32 || ch>126) { 731 String hex = Integer.toString(ch, 16); 732 sb.append("\\u" + "0000".substring(0, 4-hex.length()) + hex); 733 } else { 734 sb.append(ch); 735 } 736 } 737 return scriptPattern.matcher(sb.toString()).replaceAll("\\\\u003C$1"); 738 // return sb.toString(); 739 } 740 741 742 /** Returns a javascript string. The characters <code>'</code>, 743 * <code>"</code> and <code>\</code> are converted into their Unicode equivalents, 744 * 745 * <p>Non-printable characters are converted into unicode equivalents 746 * 747 * @deprecated use {@link #escapeJavascript(String)} instead 748 * 749 * @return The java-escaped version of the string 750 */ 751 public static String escapeJavascript2(String string) { 752 // this method only exists for backwards-compatability 753 string = reduceNewlines(string); // canonicalise CRLFs 754 return escapeJavascript(string); 755 } 756 757 758 /** Unescapes a java-escaped string. Replaces '\"' with '"', 759 * '\\u0022' with '"', '\\u0027' with ''', '\\u005C' with '\'. 760 * 761 * <p>Since this is predominantly used in the query builder, I am not worrying about 762 * unicode sequences (SWIFT is ASCII) or newlines (although this may be necessary later) 763 * for multiline textboxes 764 * 765 * @return The java-escaped version of the string 766 */ 767 public static String unescapeJava(String string) { 768 string = Text.replaceString(string, "\\\"", "\""); 769 string = Text.replaceString(string, "\\u0022", "\""); 770 string = Text.replaceString(string, "\\u0027", "'"); 771 string = Text.replaceString(string, "\\u005C", "\\"); 772 return string; 773 } 774 775 /** Returns a python string, escaped so that it can be enclosed in a single-quoted string. 776 * 777 * <p>The characters <code>'</code>, 778 * <code>"</code> and <code>\</code> are converted into their Unicode equivalents, 779 * 780 * <p>Non-printable characters are converted into unicode equivalents 781 * 782 * @return The python-escaped version of the string 783 */ 784 public static String escapePython(String string) { 785 // pretty much the same as Text.escapeJavascript2(), without the reduceNewLines, which probably shouldn't be there anyway 786 string = Text.replaceString(string, "\\", "\\u005C"); 787 string = Text.replaceString(string, "\"", "\\u0022"); 788 string = Text.replaceString(string, "'", "\\u0027"); 789 string = Text.replaceString(string, "\n", "\\n"); 790 StringBuilder sb = new StringBuilder(string.length()); 791 for (int i = 0; i<string.length(); i++) { 792 char ch = string.charAt(i); 793 if (ch>=32 && ch<=126) { 794 sb.append(ch); 795 } else { 796 String hex = Integer.toString(ch, 16); 797 sb.append("\\u" + "0000".substring(0, 4-hex.length()) + hex); 798 } 799 } 800 return sb.toString(); 801 // return string; 802 } 803 804 /** Escape a filename or path component. 805 * Characters that typically have special meanings in paths (":", "/", "\") are escaped with a preceding "\" character. 806 * 807 * Does not escape glob characters ( "*" or "?" ). 808 * Do not use this method to escape a full file path; when escaping a file path, escape each path component separately and then join 809 * the components with "/" characters ( see {@link #createEscapedPath(String[])} ). 810 * 811 * @param string the filename or path component to escape 812 * 813 * @return the escaped form of the filename (or path component) 814 */ 815 // Does not escape DOS special filenames ( "NUL", "CON", "LPT1" etc ). Remember those ? Of course you do. 816 public static String escapePathComponent(String string) { 817 string = Text.replaceString(string, "\\", "\\\\"); 818 string = Text.replaceString(string, "/", "\\/"); 819 string = Text.replaceString(string, ":", "\\:"); 820 return string; 821 } 822 823 /** Unescape a filename or path component. 824 * The escape sequences "\\" , "\:" and "\/" are converted to "\", ":" and "/" respectively. 825 * All other escape sequences will raise an IllegalArgumentException 826 * 827 * <p>See {@link #splitEscapedPath(String)} to split an escaped path into components. 828 * 829 * @param pathComponent the filename or path component to unescape 830 * 831 * @return the unescaped form of the filename or path component 832 * 833 * @throws IllegalArgumentException if an unexpected escape is encountered, or the escape is unclosed 834 */ 835 public static String unescapePathComponent(String pathComponent) { 836 if (pathComponent == null) { 837 return null; 838 } 839 char c; 840 boolean inEscape = false; 841 StringBuilder sb = new StringBuilder(pathComponent.length()); 842 for (int i = 0; i < pathComponent.length(); i++) { 843 c = pathComponent.charAt(i); 844 if (inEscape) { 845 switch (c) { 846 case '\\': 847 case '/': // intentional fall-through 848 case ':': // intentional fall-through 849 sb.append(c); 850 break; 851 default: 852 throw new IllegalArgumentException("Unexpected escape '\\" + c + "' in filename"); 853 } 854 inEscape = false; 855 } else { 856 switch (c) { 857 case '\\': 858 inEscape = true; 859 break; 860 default: 861 sb.append(c); 862 } 863 } 864 } 865 if (inEscape) { 866 throw new IllegalArgumentException("Unclosed escape in filename"); 867 } 868 return sb.toString(); 869 } 870 871 // need to escape the \ in a regex ( \\ ) in a String ( \\\\ ) 872 private static Pattern splitPathPattern = Pattern.compile("(?<!\\\\)/"); 873 874 /** Split a path, but allow forward slashes in path components if they're escaped by a preceding '\' character. 875 * Individual path components returned by this method will be unescaped. 876 * 877 * <pre> 878 * splitPath(null) = NPE 879 * splitPath("") = [ "" ] 880 * splitPath("abc") = [ "abc" ] 881 * splitPath("abc/def/ghi") = [ "abc", "def", "ghi" ] 882 * splitPath("abc\\/def/ghi") = [ "abc/def", "ghi" ] 883 * </pre> 884 * 885 * <p>Opposite of {@link #createEscapedPath(String[])} 886 */ 887 public static String[] splitEscapedPath(String escapedPath) { 888 String[] result = splitPathPattern.split(escapedPath); 889 for (int i=0; i<result.length; i++) { 890 result[i] = Text.unescapePathComponent(result[i]); 891 } 892 return result; 893 } 894 895 /** Escapes the components of a path String, returning an escaped full path String. 896 * Each path component is escaped with {@link #escapePathComponent(String)} and then joined using '/' characters. 897 * 898 * <p>Opposite of {@link #splitEscapedPath(String)}. 899 * 900 * @param pathComponents the filename components 901 * @return an escaped path 902 */ 903 public static String createEscapedPath(String[] pathComponents) { 904 String result = null; 905 if (pathComponents.length == 0) { 906 throw new IllegalArgumentException("empty pathComponents"); 907 } 908 for (String c : pathComponents) { 909 if (c==null) { 910 throw new NullPointerException("null pathComponent"); 911 } 912 if (result == null) { 913 result = escapePathComponent(c); 914 } else { 915 result = result + "/" + escapePathComponent(c); 916 } 917 } 918 return result; 919 } 920 921 // escapeCss from ESAPI 2.0.1 922 private static final String[] esapi_hex = new String[256]; 923 static { 924 for ( char c = 0; c < 0xFF; c++ ) { 925 if ( c >= 0x30 && c <= 0x39 || c >= 0x41 && c <= 0x5A || c >= 0x61 && c <= 0x7A ) { 926 esapi_hex[c] = null; 927 } else { 928 esapi_hex[c] = toHex(c).intern(); 929 } 930 } 931 } 932 private static String toHex(char c) { 933 return Integer.toHexString(c); 934 } 935 private static String getHexForNonAlphanumeric(char c) { 936 if(c<0xFF) {return esapi_hex[c]; } 937 return toHex(c); 938 } 939 private static String encodeCssCharacter(Character c) { 940 String hex = getHexForNonAlphanumeric(c); 941 if ( hex == null ) { return "" + c; } 942 return "\\" + hex + " "; 943 } 944 945 /** 946 * Returns the CSS-escaped form of a string. 947 * 948 * <p>Characters outside of the printable ASCII range are converted to \nnnn form 949 * 950 * @param input the string to convert 951 * 952 * @return the HTML-escaped form of the string 953 */ 954 public static String escapeCss(String input) { 955 if (input == null) { return ""; } 956 StringBuilder sb = new StringBuilder(); 957 for (int i = 0; i < input.length(); i++) { 958 char c = input.charAt(i); 959 sb.append(encodeCssCharacter(c)); 960 } 961 return sb.toString(); 962 } 963 964 965 966 967 /** Returns the given string; but will truncate it to MAX_STRING_OUTPUT_CHARS. 968 * If it exceeds this length, a message is appended expressing how many 969 * characters were truncated. Strings with the key of 'exception' are 970 * not truncated (in order to display full stack traces when these occur). 971 * Any keys that contain the text 'password', 'Password', 'credential' or 972 * 'Credential' will be returned as eight asterisks. 973 * 974 * <p>This method is used in the debug JSP when dumping properties to the user, 975 * in order to prevent inordinately verbose output. 976 * 977 * @param key The key of the string we wish to display 978 * @param string The string value 979 * @return A (possibly truncated) version of this string 980 */ 981 public static String getDisplayString(String key, String string) { 982 return getDisplayString(key, string, MAX_STRING_OUTPUT_CHARS); 983 } 984 985 /** Returns the given string; but will truncate it to MAX_STRING_OUTPUT_CHARS. 986 * If it exceeds this length, a message is appended expressing how many 987 * characters were truncated. Strings with the key of 'exception' are 988 * not truncated (in order to display full stack traces when these occur). 989 * Any keys that contain the text 'password', 'Password', 'credential' or 990 * 'Credential' will be returned as eight asterisks. 991 * 992 * <p>This method is used in the debug JSP when dumping properties to the user, 993 * in order to prevent inordinately verbose output. 994 * 995 * @param key The key of the string we wish to display 996 * @param string The string value 997 * @param maxChars The maximum number of characters to display 998 * 999 * @return A (possibly truncated) version of this string 1000 */ 1001 public static String getDisplayString(String key, String string, int maxChars) { 1002 if (string == null) { 1003 string = "(null)"; 1004 } 1005 1006 if ("exception".equals(key)) { 1007 return string; 1008 } 1009 1010 if (key.indexOf("password") >= 0 || key.indexOf("Password") >= 0 || key.indexOf("credential") >= 0 || key.indexOf("Credential") >= 0) { 1011 return "********"; 1012 } 1013 1014 if (string.length() <= maxChars) { 1015 return string; 1016 } else { 1017 return string.substring(0, maxChars) + "... (" + (string.length() - maxChars) + " more characters truncated)"; 1018 } 1019 } 1020 1021 /** Utility function to return a default if the supplied string is null. 1022 * Shorthand for <code>(strText==null) ? strDefaultText : strText;</code> 1023 * 1024 * @return strText is strText is not null, otherwise strDefaultText 1025 */ 1026 public static String strDefault(String strText, String strDefaultText) { 1027 return (strText == null) ? strDefaultText : strText; 1028 } 1029 1030 /** Return a string composed of a series of strings, separated with the specified delimiter 1031 * 1032 * @param elements The array of elements to join 1033 * @return delimiter The delimiter to join each string with 1034 * 1035 * @throws NullPointerException if elements or delimiter is null 1036 */ 1037 public static String join(String[] elements, String delimiter) { 1038 return joinWithLast(elements, false, delimiter, delimiter); 1039 } 1040 1041 /** Return a string composed of a series of strings, separated with the specified delimiter 1042 * 1043 * @param elements A Collection or Iterable of the elements to join 1044 * @return delimiter The delimiter to join each string with 1045 * 1046 * @throws NullPointerException if elements or delimiter is null 1047 */ 1048 public static String join(Iterable<?> elements, String delimiter) { 1049 return joinWithLast(elements, false, delimiter, delimiter); 1050 } 1051 1052 /** Return a string composed of a series of strings, separated with the specified delimiter. 1053 * Each element is contained in single quotes. The final delimeter can be set to a different 1054 * value, to produce text in the form <code>"'a', 'b' or 'c'"</code> or <code>"'a', 'b' and 'c'"</code>. 1055 * 1056 * <p>There is no special handling of values containing quotes; see {@link #escapeCsv(String)} 1057 * 1058 * @param elements The array of elements to join 1059 * @param isQuoted If true, each element is surrounded by single quotes 1060 * @param delimiter The delimiter to join each string with 1061 * @param lastDelimiter The delimiter to join the second-last and last elements 1062 * 1063 * @throws NullPointerException if elements or delimiter is null 1064 */ 1065 public static String joinWithLast(String[] elements, boolean isQuoted, String delimiter, String lastDelimiter) { 1066 StringBuilder sb = new StringBuilder(); 1067 if (elements == null) { 1068 throw new NullPointerException("null elements"); 1069 } 1070 if (delimiter == null) { 1071 throw new NullPointerException("null delimiter"); 1072 } 1073 if (lastDelimiter == null) { 1074 throw new NullPointerException("null lastDelimiter"); 1075 } 1076 int len = elements.length; 1077 if (len == 0) { 1078 return ""; 1079 } 1080 1081 for (int i = 0; i < len - 1; i++) { 1082 if (isQuoted) { sb.append("'"); } 1083 sb.append(elements[i]); 1084 if (isQuoted) { sb.append("'"); } 1085 if (i == len - 2) { sb.append(lastDelimiter); } else { sb.append(delimiter); } 1086 } 1087 if (isQuoted) { sb.append("'"); } 1088 sb.append(elements[len - 1]); 1089 if (isQuoted) { sb.append("'"); } 1090 return sb.toString(); 1091 } 1092 1093 /** Return a string composed of a series of strings, separated with the specified delimiter 1094 * 1095 * <p>There is no special handling of values containing quotes; see {@link #escapeCsv(String)} 1096 * 1097 * @param elements A Collection or Iterable containing the elements to join 1098 * @param isQuoted If true, each element is surrounded by single quotes 1099 * @param delimiter The delimiter to join each string with 1100 * @param lastDelimiter The delimiter to join the second-last and last elements 1101 * 1102 * @throws NullPointerException if elements or delimiter is null 1103 * 1104 * @see #join(String[], String) 1105 */ 1106 public static String joinWithLast(Iterable<?> elements, boolean isQuoted, String delimiter, String lastDelimiter) { 1107 StringBuilder sb = new StringBuilder(); 1108 if (elements == null) { 1109 throw new NullPointerException("null elements"); 1110 } 1111 if (delimiter == null) { 1112 throw new NullPointerException("null delimiter"); 1113 } 1114 if (lastDelimiter == null) { 1115 throw new NullPointerException("null lastDelimiter"); 1116 } 1117 Iterator<?> i = elements.iterator(); 1118 if (!i.hasNext()) { return ""; } 1119 1120 Object thisEl = i.next(); 1121 while (i.hasNext()) { 1122 Object nextEl = i.next(); 1123 if (isQuoted) { sb.append("'"); } 1124 sb.append(thisEl); 1125 if (isQuoted) { sb.append("'"); } 1126 if (i.hasNext()) { 1127 sb.append(delimiter); 1128 } else { 1129 sb.append(lastDelimiter); 1130 } 1131 thisEl = nextEl; 1132 } 1133 if (isQuoted) { sb.append("'"); } 1134 sb.append(thisEl); 1135 if (isQuoted) { sb.append("'"); } 1136 1137 return sb.toString(); 1138 } 1139 1140 1141 1142 1143 /* 1144 * efficient search & replace ... stolen from Usenet: 1145 * http://groups.google.co.uk/groups?hl=en&lr=&selm=memo.19990629182431.344B%40none.crap 1146 */ 1147 1148 /** 1149 * An efficient search & replace routine. Replaces all instances of 1150 * searchString within str with replaceString. 1151 * 1152 * @param originalString The string to search 1153 * @param searchString The string to search for 1154 * @param replaceString The string to replace it with 1155 * 1156 */ 1157 public static String replaceString(String originalString, String searchString, String replaceString) { 1158 if (replaceString == null) { 1159 return originalString; 1160 } 1161 1162 if (searchString == null) { 1163 return originalString; 1164 } 1165 1166 if (originalString == null) { 1167 return null; 1168 } 1169 1170 int loc = originalString.indexOf(searchString); 1171 1172 if (loc == -1) { 1173 return originalString; 1174 } 1175 1176 char[] src = originalString.toCharArray(); 1177 int n = searchString.length(); 1178 int m = originalString.length(); 1179 StringBuilder buf = new StringBuilder(m + replaceString.length() - n); 1180 int start = 0; 1181 1182 do { 1183 if (loc > start) { 1184 buf.append(src, start, loc - start); 1185 } 1186 1187 buf.append(replaceString); 1188 start = loc + n; 1189 loc = originalString.indexOf(searchString, start); 1190 } while (loc > 0); 1191 1192 if (start < m) { 1193 buf.append(src, start, m - start); 1194 } 1195 1196 return buf.toString(); 1197 } 1198 1199 /** 1200 * Reads a file, and returns its contents in a String 1201 * 1202 * @param filename The file to read 1203 * 1204 * @return The contents of the string, 1205 * 1206 * @throws IOException A problem occurred whilst attempting to read the string 1207 */ 1208 public static String getFileContents(String filename) 1209 throws IOException { 1210 File file = new File(filename); 1211 FileInputStream fis = new FileInputStream(file); 1212 byte[] data = new byte[(int) file.length()]; 1213 int len = fis.read(data); 1214 fis.close(); 1215 if (len < file.length()) { 1216 /* this should never happen -- file has changed underneath us */ 1217 throw new IOException("Buffer read != size of file"); 1218 } 1219 1220 return new String(data); 1221 } 1222 1223 /** 1224 * Reads a file, and returns its contents in a String. Identical to calling 1225 * <code>getFileContents(projectFile.getCanonicalPath())</code>. 1226 * 1227 * @param file The file to read 1228 * 1229 * @return The contents of the string, 1230 * @throws IOException 1231 * 1232 * @throws IOException A problem occurred whilst attempting to read the string 1233 */ 1234 public static String getFileContents(File file) throws IOException { 1235 return getFileContents(file.getCanonicalPath()); 1236 } 1237 1238 1239 /** 1240 * Prefixes every lines supplied with a given indent. e.g. 1241 * <code>indent("\t", "abcd\nefgh")</code> would return "\tabcd\n\tefgh". If the 1242 * string ends in a newline, then the return value also ends with a newline. 1243 * 1244 * @param indentString The characters to indent with. Usually spaces or tabs, 1245 * but could be something like a timestamp. 1246 * @param originalString The string to indent. 1247 * @return The originalString, with every line (as separated by the newline 1248 * character) prefixed with indentString. 1249 */ 1250 static public String indent(String indentString, String originalString) { 1251 String allButLastChar; 1252 if (originalString == null || indentString == null) { 1253 throw new NullPointerException(); 1254 } 1255 if (originalString.equals("")) { 1256 return indentString; 1257 } 1258 allButLastChar = originalString.substring(0, originalString.length() - 1); 1259 return indentString + replaceString(allButLastChar, "\n", "\n" + indentString) + originalString.substring(originalString.length() - 1); 1260 } 1261 1262 /** Ensure that a string is padded with spaces so that it meets the 1263 * required length. If the input string exceeds this length, this it 1264 * is returned unchanged 1265 * 1266 * @param inputString the string to pad 1267 * @param length the desired length 1268 * @param justification a JUSTIFICATION_* constant defining whether left or 1269 * right justification is required. 1270 * 1271 * @return a padded string. 1272 */ 1273 static public String pad(String inputString, int length, int justification) { 1274 // @TODO not terribly efficient, but who cares 1275 switch (justification) { 1276 case JUSTIFICATION_LEFT: 1277 while (inputString.length() < length) { 1278 inputString = inputString + " "; 1279 } 1280 break; 1281 1282 case JUSTIFICATION_RIGHT: 1283 while (inputString.length() < length) { 1284 inputString = " " + inputString; 1285 } 1286 break; 1287 1288 case JUSTIFICATION_CENTER: 1289 while (inputString.length() < length) { 1290 inputString = inputString + " "; 1291 if (inputString.length() < length) { 1292 inputString = " " + inputString; 1293 } 1294 } 1295 break; 1296 } 1297 return inputString; 1298 } 1299 1300 /** Given a period-separated list of components (e.g. variable references ("a.b.c") or classnames), 1301 * returns the last component. For example, 1302 * getLastComponent("com.randomnoun.common.util.Text") will return "Text". 1303 * 1304 * <p>If component is null, this function returns null. 1305 * <p>If component contains no periods, this function returns the original string. 1306 * 1307 * @param string The string to retrieve the last component from 1308 */ 1309 static public String getLastComponent(String string) { 1310 if (string == null) { 1311 return null; 1312 } 1313 if (string.indexOf('.') == -1) { 1314 return string; 1315 } 1316 return string.substring(string.lastIndexOf('.') + 1); 1317 } 1318 1319 /** Escape this supplied string so it can represent a 'name' or 'value' component 1320 * on a HTTP queryString. This generally involves escaping special characters into %xx 1321 * form. Note that this only works for US-ASCII data. 1322 * 1323 */ 1324 public static String escapeQueryString(String unescapedQueryString) { 1325 // default encoding 1326 byte[] data = encodeUrl(allowed_within_query, unescapedQueryString.getBytes()); 1327 1328 try { 1329 return new String(data, "US-ASCII"); 1330 } catch (UnsupportedEncodingException e) { 1331 throw new RuntimeException("encodeQueryString() requires ASCII support"); 1332 } 1333 } 1334 1335 /** 1336 * Encodes an array of bytes into an array of URL safe 7-bit 1337 * characters. Unsafe characters are escaped. 1338 * 1339 * @param urlsafe bitset of characters deemed URL safe 1340 * @param bytes array of bytes to convert to URL safe characters 1341 * @return array of bytes containing URL safe characters 1342 */ 1343 private static final byte[] encodeUrl(BitSet urlsafe, byte[] bytes) { 1344 if (bytes == null) { 1345 return null; 1346 } 1347 1348 if (urlsafe == null) { 1349 throw new NullPointerException("null urlsafe"); 1350 } 1351 1352 ByteArrayOutputStream buffer = new ByteArrayOutputStream(); 1353 1354 for (int i = 0; i < bytes.length; i++) { 1355 int b = bytes[i]; 1356 1357 if (b < 0) { 1358 b = 256 + b; 1359 } 1360 1361 if (urlsafe.get(b)) { 1362 if (b == ' ') { 1363 b = '+'; 1364 } 1365 1366 buffer.write(b); 1367 } else { 1368 buffer.write('%'); 1369 1370 char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, 16)); 1371 char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, 16)); 1372 1373 buffer.write(hex1); 1374 buffer.write(hex2); 1375 } 1376 } 1377 1378 return buffer.toByteArray(); 1379 } 1380 1381 /** 1382 * Encodes a string into Base64 format. 1383 * No blanks or line breaks are inserted. 1384 * @param s a String to be encoded. 1385 * @return A String with the Base64 encoded data. 1386 */ 1387 public static String encodeBase64(String s) { 1388 return new String(encodeBase64(s.getBytes())); 1389 } 1390 1391 /** 1392 * Encodes a byte array into Base64 format. 1393 * No blanks or line breaks are inserted. 1394 * @param in an array containing the data bytes to be encoded. 1395 * @return A character array with the Base64 encoded data. 1396 */ 1397 public static char[] encodeBase64(byte[] in) { 1398 int iLen = in.length; 1399 int oDataLen = (iLen * 4 + 2) / 3; // output length without padding 1400 int oLen = ((iLen + 2) / 3) * 4; // output length including padding 1401 char[] out = new char[oLen]; 1402 int ip = 0; 1403 int op = 0; 1404 1405 while (ip < iLen) { 1406 int i0 = in[ip++] & 0xff; 1407 int i1 = ip < iLen ? in[ip++] & 0xff : 0; 1408 int i2 = ip < iLen ? in[ip++] & 0xff : 0; 1409 int o0 = i0 >>> 2; 1410 int o1 = ((i0 & 3) << 4) | (i1 >>> 4); 1411 int o2 = ((i1 & 0xf) << 2) | (i2 >>> 6); 1412 int o3 = i2 & 0x3F; 1413 out[op++] = map1[o0]; 1414 out[op++] = map1[o1]; 1415 out[op] = op < oDataLen ? map1[o2] : '='; 1416 op++; 1417 out[op] = op < oDataLen ? map1[o3] : '='; 1418 op++; 1419 } 1420 return out; 1421 } 1422 1423 /** Used by {@link #parseData(String) to parse dates generated in Codec output. 1424 * (These dates are generated using the standard Java .toString() method, which 1425 * probably changes depending on the VM's locale, which I'm going to ignore for 1426 * the time being). 1427 */ 1428 static class DateParser { 1429 1430 /** Parse a date generated by Date.toString() into a Date object 1431 * 1432 * @param dateString a string representation of a date 1433 * @return a Date representation of a date 1434 */ 1435 public static Date valueOf(String dateString) { 1436 SimpleDateFormat sdf = new SimpleDateFormat("EEE MMM dd hh:mm:ss z yyyy"); 1437 try { 1438 return sdf.parse(dateString); 1439 } catch (ParseException pe) { 1440 throw (IllegalArgumentException) new IllegalArgumentException("Invalid date '" + dateString + "'").initCause(pe); 1441 } 1442 } 1443 } 1444 1445 // ---------------------- Generous characters for each component validation 1446 // -- not much of this is used in this class, so I should shorten these definitions, 1447 // but you never know, I might use it later, so it's here for the time being. 1448 // 1449 // compiled from 1450 // org.apache.commons.httpclient.util.URIUtil 1451 // org.apache.commons.codec.net.URLCodec 1452 // org.apache.commons.httpclient.util.EncodingUtil 1453 // org.apache.commons.httpclient.URI 1454 // 1455 // trust me... just calling escapeQueryString() is *so* much easier. 1456 private static final BitSet percent = new BitSet(256); // escape % as %25 1457 private static final BitSet digit = new BitSet(256); // 0-9 1458 private static final BitSet alpha = new BitSet(256); // lowalpha | upalpha 1459 private static final BitSet alphanum = new BitSet(256); // alpha | digit 1460 private static final BitSet hex = new BitSet(256); // digit | a-f | A-F 1461 private static final BitSet escaped = new BitSet(256); // "%" hex hex 1462 private static final BitSet mark = new BitSet(256); // -_.!~*'() 1463 private static final BitSet unreserved = new BitSet(256); 1464 1465 // alphanum | mark (URI allowed, no purpose) 1466 private static final BitSet reserved = new BitSet(256); // ;/?:"&=+$, 1467 private static final BitSet uric = new BitSet(256); 1468 1469 // reserved | unreserved | escaped 1470 private static final BitSet allowed_query = new BitSet(256); // uric - % 1471 private static final BitSet allowed_within_query = new BitSet(256); 1472 1473 /** Mapping table from 6-bit nibble to Base64 characters */ 1474 private static char[] map1 = new char[64]; 1475 1476 // NB: www-form-encoding appears to be alpha | numeric | -_.* ( + space) 1477 static { 1478 percent.set('%'); 1479 1480 for (int i = '0'; i <= '9'; i++) { 1481 digit.set(i); 1482 } 1483 1484 for (int i = 'a'; i <= 'z'; i++) { 1485 alpha.set(i); 1486 } 1487 1488 for (int i = 'A'; i <= 'Z'; i++) { 1489 alpha.set(i); 1490 } 1491 1492 alphanum.or(alpha); 1493 alphanum.or(digit); 1494 hex.or(digit); 1495 1496 for (int i = 'a'; i <= 'f'; i++) { 1497 hex.set(i); 1498 } 1499 1500 for (int i = 'A'; i <= 'F'; i++) { 1501 hex.set(i); 1502 } 1503 1504 escaped.or(percent); 1505 escaped.or(hex); 1506 mark.set('-'); 1507 mark.set('_'); 1508 mark.set('.'); 1509 mark.set('!'); 1510 mark.set('~'); 1511 mark.set('*'); 1512 mark.set('\''); 1513 mark.set('('); 1514 mark.set(')'); 1515 reserved.set(';'); 1516 reserved.set('/'); 1517 reserved.set('?'); 1518 reserved.set(':'); 1519 reserved.set('@'); 1520 reserved.set('&'); 1521 reserved.set('='); 1522 reserved.set('+'); 1523 reserved.set('$'); 1524 reserved.set(','); 1525 unreserved.or(alphanum); 1526 unreserved.or(mark); 1527 uric.or(reserved); 1528 uric.or(unreserved); 1529 uric.or(escaped); 1530 allowed_query.or(uric); 1531 allowed_query.clear('%'); 1532 allowed_within_query.or(allowed_query); 1533 allowed_within_query.andNot(reserved); 1534 1535 1536 // excluded 'reserved' 1537 // create map1 array 1538 int i = 0; 1539 for (char c = 'A'; c <= 'Z'; c++) { 1540 map1[i++] = c; 1541 } 1542 for (char c = 'a'; c <= 'z'; c++) { 1543 map1[i++] = c; 1544 } 1545 for (char c = '0'; c <= '9'; c++) { 1546 map1[i++] = c; 1547 } 1548 map1[i++] = '+'; 1549 map1[i++] = '/'; 1550 1551 } 1552 1553 1554 1555 /** 1556 * Returns a comparator that compares contained numbers based on their numeric values and compares other parts 1557 * using the current locale's order rules. 1558 * <p>For example in German locale this will be a comparator that handles umlauts correctly and ignores 1559 * upper/lower case differences.</p> 1560 * 1561 * @return <p>A string comparator that uses the current locale's order rules and handles embedded numbers 1562 * correctly.</p> 1563 */ 1564 public static Comparator<String> getNaturalComparator() { 1565 final Collator collator = Collator.getInstance(); 1566 return new Comparator<String>() { 1567 public int compare(String o1, String o2) { 1568 return compareNatural(collator, o1, o2); 1569 } 1570 }; 1571 } 1572 1573 /** 1574 * <p>Compares two strings using the current locale's rules and comparing contained numbers based on their numeric 1575 * values.</p> 1576 * <p>This is probably the best default comparison to use.</p> 1577 * <p>If you know that the texts to be compared are in a certain language that differs from the default locale's 1578 * langage, then get a collator for the desired locale ({@link java.text.Collator#getInstance(java.util.Locale)}) 1579 * and pass it to {@link #compareNatural(java.text.Collator, String, String)}</p> 1580 * 1581 * @param s first string 1582 * @param t second string 1583 * @return zero iff <code>s</code> and <code>t</code> are equal, 1584 * a value less than zero iff <code>s</code> lexicographically precedes <code>t</code> 1585 * and a value larger than zero iff <code>s</code> lexicographically follows <code>t</code> 1586 */ 1587 public static int compareNatural(Collator collator, String s, String t) { 1588 return compareNatural(s, t, false, collator); 1589 } 1590 1591 1592 /** Natural compare operation. Stolen from 1593 * http://www.eekboom.com/java/compareNatural/src/com/eekboom/utils/Strings.java 1594 * (source file is under BSD license). 1595 * 1596 * @param s first string 1597 * @param t second string 1598 * @param caseSensitive treat characters differing in case only as equal - will be ignored if a collator is given 1599 * @param collator used to compare subwords that aren't numbers - if null, characters will be compared 1600 * individually based on their Unicode value 1601 * @return zero iff <code>s</code> and <code>t</code> are equal, 1602 * a value less than zero iff <code>s</code> lexicographically precedes <code>t</code> 1603 * and a value larger than zero iff <code>s</code> lexicographically follows <code>t</code> 1604 */ 1605 private static int compareNatural(String s, String t, boolean caseSensitive, Collator collator) { 1606 int sIndex = 0; 1607 int tIndex = 0; 1608 1609 int sLength = s.length(); 1610 int tLength = t.length(); 1611 1612 while(true) { 1613 // both character indices are after a subword (or at zero) 1614 1615 // Check if one string is at end 1616 if(sIndex == sLength && tIndex == tLength) { 1617 return 0; 1618 } 1619 if(sIndex == sLength) { 1620 return -1; 1621 } 1622 if(tIndex == tLength) { 1623 return 1; 1624 } 1625 1626 // Compare sub word 1627 char sChar = s.charAt(sIndex); 1628 char tChar = t.charAt(tIndex); 1629 1630 boolean sCharIsDigit = Character.isDigit(sChar); 1631 boolean tCharIsDigit = Character.isDigit(tChar); 1632 1633 if(sCharIsDigit && tCharIsDigit) { 1634 // Compare numbers 1635 1636 // skip leading 0s 1637 int sLeadingZeroCount = 0; 1638 while(sChar == '0') { 1639 ++sLeadingZeroCount; 1640 ++sIndex; 1641 if(sIndex == sLength) { 1642 break; 1643 } 1644 sChar = s.charAt(sIndex); 1645 } 1646 int tLeadingZeroCount = 0; 1647 while(tChar == '0') { 1648 ++tLeadingZeroCount; 1649 ++tIndex; 1650 if(tIndex == tLength) { 1651 break; 1652 } 1653 tChar = t.charAt(tIndex); 1654 } 1655 boolean sAllZero = sIndex == sLength || !Character.isDigit(sChar); 1656 boolean tAllZero = tIndex == tLength || !Character.isDigit(tChar); 1657 if(sAllZero && tAllZero) { 1658 continue; 1659 } 1660 if(sAllZero && !tAllZero) { 1661 return -1; 1662 } 1663 if(tAllZero) { 1664 return 1; 1665 } 1666 1667 int diff = 0; 1668 do { 1669 if(diff == 0) { 1670 diff = sChar - tChar; 1671 } 1672 ++sIndex; 1673 ++tIndex; 1674 if(sIndex == sLength && tIndex == tLength) { 1675 return diff != 0 ? diff : sLeadingZeroCount - tLeadingZeroCount; 1676 } 1677 if(sIndex == sLength) { 1678 if(diff == 0) { 1679 return -1; 1680 } 1681 return Character.isDigit(t.charAt(tIndex)) ? -1 : diff; 1682 } 1683 if(tIndex == tLength) { 1684 if(diff == 0) { 1685 return 1; 1686 } 1687 return Character.isDigit(s.charAt(sIndex)) ? 1 : diff; 1688 } 1689 sChar = s.charAt(sIndex); 1690 tChar = t.charAt(tIndex); 1691 sCharIsDigit = Character.isDigit(sChar); 1692 tCharIsDigit = Character.isDigit(tChar); 1693 if(!sCharIsDigit && !tCharIsDigit) { 1694 // both number sub words have the same length 1695 if(diff != 0) { 1696 return diff; 1697 } 1698 break; 1699 } 1700 if(!sCharIsDigit) { 1701 return -1; 1702 } 1703 if(!tCharIsDigit) { 1704 return 1; 1705 } 1706 } while(true); 1707 } 1708 else { 1709 // Compare words 1710 if(collator != null) { 1711 // To use the collator the whole subwords have to be compared - character-by-character comparision 1712 // is not possible. So find the two subwords first 1713 int aw = sIndex; 1714 int bw = tIndex; 1715 do { 1716 ++sIndex; 1717 } while(sIndex < sLength && !Character.isDigit(s.charAt(sIndex))); 1718 do { 1719 ++tIndex; 1720 } while(tIndex < tLength && !Character.isDigit(t.charAt(tIndex))); 1721 1722 String as = s.substring(aw, sIndex); 1723 String bs = t.substring(bw, tIndex); 1724 int subwordResult = collator.compare(as, bs); 1725 if(subwordResult != 0) { 1726 return subwordResult; 1727 } 1728 } 1729 else { 1730 // No collator specified. All characters should be ascii only. Compare character-by-character. 1731 do { 1732 if(sChar != tChar) { 1733 if(caseSensitive) { 1734 return sChar - tChar; 1735 } 1736 sChar = Character.toUpperCase(sChar); 1737 tChar = Character.toUpperCase(tChar); 1738 if(sChar != tChar) { 1739 sChar = Character.toLowerCase(sChar); 1740 tChar = Character.toLowerCase(tChar); 1741 if(sChar != tChar) { 1742 return sChar - tChar; 1743 } 1744 } 1745 } 1746 ++sIndex; 1747 ++tIndex; 1748 if(sIndex == sLength && tIndex == tLength) { 1749 return 0; 1750 } 1751 if(sIndex == sLength) { 1752 return -1; 1753 } 1754 if(tIndex == tLength) { 1755 return 1; 1756 } 1757 sChar = s.charAt(sIndex); 1758 tChar = t.charAt(tIndex); 1759 sCharIsDigit = Character.isDigit(sChar); 1760 tCharIsDigit = Character.isDigit(tChar); 1761 } while(!sCharIsDigit && !tCharIsDigit); 1762 } 1763 } 1764 } 1765 } 1766 1767 1768 // taken from the W3C Jigsaw server sourcecode; class org.w3c.jigsaw.http.Request#unescape(String) 1769 /** 1770 * Unescape a HTTP escaped string 1771 * @param s The string to be unescaped 1772 * @return the unescaped string. 1773 */ 1774 public static String unescapeQueryString (String s) { 1775 StringBuilder sbuf = new StringBuilder() ; 1776 int len = s.length() ; 1777 int ch = -1 ; 1778 for (int i = 0 ; i < len ; i++) { 1779 switch (ch = s.charAt(i)) { 1780 case '%': 1781 if (i < len - 2) { 1782 // @TODO check to see how illegal escapes are treated 1783 // e.g. "%nothex" 1784 ch = s.charAt (++i) ; 1785 int hb = (Character.isDigit ((char) ch) 1786 ? ch - '0' 1787 : 10+Character.toLowerCase ((char) ch)-'a') & 0xF ; 1788 ch = s.charAt (++i) ; 1789 int lb = (Character.isDigit ((char) ch) 1790 ? ch - '0' 1791 : 10+Character.toLowerCase ((char) ch)-'a') & 0xF ; 1792 sbuf.append ((char) ((hb << 4) | lb)) ; 1793 } else { 1794 sbuf.append ('%'); // hit EOL, just leave as is 1795 } 1796 break ; 1797 case '+': 1798 sbuf.append (' ') ; 1799 break ; 1800 default: 1801 sbuf.append ((char) ch) ; 1802 } 1803 } 1804 return sbuf.toString() ; 1805 } 1806 1807 /** Returns the largest common prefix between two other strings; e.g. 1808 * getCommonPrefix("abcsomething", "abcsometharg") would be "abcsometh". 1809 * 1810 * @param string1 String number one 1811 * @param string2 String number two 1812 * 1813 * @return the large common prefix between the two strings 1814 * 1815 * @throws NullPointerException is string1 or string2 is null 1816 */ 1817 public static String getCommonPrefix(String string1, String string2) { 1818 if (string1==null) { throw new NullPointerException("null string1"); } 1819 if (string2==null) { throw new NullPointerException("null string2"); } 1820 int c = 0; 1821 int maxLen = Math.min(string1.length(), string2.length()); 1822 1823 while (c < maxLen && string1.charAt(c)==string2.charAt(c)) { 1824 c++; 1825 } 1826 return string1.substring(0, c); 1827 } 1828 1829 /** Uppercases the first character of a string. 1830 * 1831 * @param text text to modify 1832 * 1833 * @return the supplied text, with the first character converted to uppercase. 1834 */ 1835 static public String toFirstUpper(String text) { 1836 return Character.toUpperCase(text.charAt(0)) + text.substring(1); 1837 } 1838 1839 1840 /** Lowercases the first character of a string. 1841 * 1842 * @param text text to modify 1843 * 1844 * @return the supplied text, with the first character converted to lowercase. 1845 */ 1846 static public String toFirstLower(String text) { 1847 return Character.toLowerCase(text.charAt(0)) + text.substring(1); 1848 } 1849 1850 1851 1852 1853 /** Number of character edits between two strings; taken from 1854 * http://www.merriampark.com/ldjava.htm. There's a version in commongs-lang, 1855 * apparently, but according to the comments on that page, it uses O(n^2) memory, 1856 * which can't be good. 1857 * 1858 * @param s string 1 1859 * @param t string 2 1860 * 1861 * @return the smallest number of edits required to convert s into t 1862 */ 1863 public static int getLevenshteinDistance (String s, String t) { 1864 if (s == null || t == null) { 1865 throw new IllegalArgumentException("Strings must not be null"); 1866 } 1867 1868 /* 1869 The difference between this impl. and the previous is that, rather 1870 than creating and retaining a matrix of size s.length()+1 by t.length()+1, 1871 we maintain two single-dimensional arrays of length s.length()+1. The first, d, 1872 is the 'current working' distance array that maintains the newest distance cost 1873 counts as we iterate through the characters of String s. Each time we increment 1874 the index of String t we are comparing, d is copied to p, the second int[]. Doing so 1875 allows us to retain the previous cost counts as required by the algorithm (taking 1876 the minimum of the cost count to the left, up one, and diagonally up and to the left 1877 of the current cost count being calculated). (Note that the arrays aren't really 1878 copied anymore, just switched...this is clearly much better than cloning an array 1879 or doing a System.arraycopy() each time through the outer loop.) 1880 1881 Effectively, the difference between the two implementations is this one does not 1882 cause an out of memory condition when calculating the LD over two very large strings. 1883 */ 1884 1885 int n = s.length(); // length of s 1886 int m = t.length(); // length of t 1887 1888 if (n == 0) { 1889 return m; 1890 } else if (m == 0) { 1891 return n; 1892 } 1893 1894 int p[] = new int[n+1]; //'previous' cost array, horizontally 1895 int d[] = new int[n+1]; // cost array, horizontally 1896 int _d[]; //placeholder to assist in swapping p and d 1897 1898 // indexes into strings s and t 1899 int i; // iterates through s 1900 int j; // iterates through t 1901 1902 char t_j; // jth character of t 1903 1904 int cost; // cost 1905 1906 for (i = 0; i<=n; i++) { 1907 p[i] = i; 1908 } 1909 1910 for (j = 1; j<=m; j++) { 1911 t_j = t.charAt(j-1); 1912 d[0] = j; 1913 1914 for (i=1; i<=n; i++) { 1915 cost = s.charAt(i-1)==t_j ? 0 : 1; 1916 // minimum of cell to the left+1, to the top+1, diagonally left and up +cost 1917 d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+cost); 1918 } 1919 1920 // copy current distance counts to 'previous row' distance counts 1921 _d = p; 1922 p = d; 1923 d = _d; 1924 } 1925 1926 // our last action in the above loop was to switch d and p, so p now 1927 // actually has the most recent cost counts 1928 return p[n]; 1929 } 1930 1931 /** Return the md5 hash of a string 1932 * 1933 * @param text text to hash 1934 * 1935 * @return a hex-encoded version of the MD5 hash 1936 * 1937 * @throws IllegalStateException if the java installation in use doesn't know 1938 * about MD5 1939 */ 1940 static public String getMD5(String text) { 1941 try{ 1942 MessageDigest algorithm = MessageDigest.getInstance("MD5"); 1943 algorithm.reset(); 1944 // algorithm.update(defaultBytes); 1945 algorithm.update(text.getBytes()); 1946 byte messageDigest[] = algorithm.digest(); 1947 1948 StringBuilder hexString = new StringBuilder(); 1949 for (int i=0;i<messageDigest.length;i++) { 1950 hexString.append(Integer.toHexString(0xFF & messageDigest[i])); 1951 } 1952 return hexString.toString(); 1953 } catch (NoSuchAlgorithmException nsae) { 1954 throw (IllegalStateException) new IllegalStateException("Unknown algorithm 'MD5'").initCause(nsae); 1955 } 1956 } 1957 1958 /** Returns a string composed of the supplied text, repeated 0 or more times 1959 * 1960 * @param text text to repeat 1961 * @param count number of repetitions 1962 * 1963 * @return the repeated text 1964 */ 1965 static public String repeat(String text, int count) { 1966 StringBuffer sb = new StringBuffer(); 1967 for (int i=0; i<count; i++) { 1968 sb.append(text); 1969 } 1970 return sb.toString(); 1971 } 1972 1973 1974 /** Perform ${xxxx}-style substitution of placeholders in text. Placeholders without 1975 * values will be left as-is. 1976 * 1977 * <p>For example, gives the set of variables: 1978 * <ul> 1979 * <li>abc = def 1980 * </ul> 1981 * 1982 * <p>then the result of <code>substituteParameters("xxxx${abc}yyyy${def}zzzz")</code> 1983 * will be "xxxxdefyyyy${def}zzzz" 1984 * 1985 * <p><code>$</code> followed by any other character will be left as-is. 1986 * 1987 * @param variables a set of variable names and values, used in the substitution 1988 * @param text the text to be substituted. 1989 * 1990 * @return text, with placeholders replaced with values in the variables parameter 1991 */ 1992 public static String substitutePlaceholders(Map<?, ?> variables, String text) { 1993 // escaped version of (\$\{.*?\}|[^$]+|\$.) 1994 Pattern p = Pattern.compile("(\\$\\{.*?\\}|[^$]+|\\$)"); // modified regex 1995 Matcher m = p.matcher(text); 1996 String result = ""; 1997 while (m.find()) { 1998 String token = m.group(1); 1999 if (token.startsWith("${") && token.endsWith("}")) { 2000 Object value = variables.get(token.substring(2, token.length()-1)); 2001 if (value == null) { 2002 result = result + token; 2003 } else { 2004 result = result + value.toString(); 2005 } 2006 } else { 2007 result = result + token; 2008 } 2009 } 2010 return result; 2011 } 2012 2013 2014}