1 package com.randomnoun.common;
2
3 /* (c) 2013 randomnoun. All Rights Reserved. This work is licensed under a
4 * BSD Simplified License. (http://www.randomnoun.com/bsd-simplified.html)
5 */
6 import java.io.ByteArrayOutputStream;
7 import java.io.File;
8 import java.io.FileInputStream;
9 import java.io.IOException;
10 import java.io.Reader;
11 import java.io.UnsupportedEncodingException;
12 import java.security.MessageDigest;
13 import java.security.NoSuchAlgorithmException;
14 import java.text.Collator;
15 import java.text.ParseException;
16 import java.text.SimpleDateFormat;
17 import java.util.ArrayList;
18 import java.util.BitSet;
19 import java.util.Comparator;
20 import java.util.Date;
21 import java.util.Iterator;
22 import java.util.List;
23 import java.util.Map;
24 import java.util.regex.Matcher;
25 import java.util.regex.Pattern;
26
27 /** Text utility functions
28 *
29 * @author knoxg
30 */
31 public class Text {
32
33 /** Used to prevent massive debug dumps. See {@link #getDisplayString(String, String)} */
34 private static final int MAX_STRING_OUTPUT_CHARS = 300;
35
36 /** Left-justification constant for use in the {@link #pad(String, int, int)} method */
37 public static final int JUSTIFICATION_LEFT = 0;
38
39 /** Center-justification constant for use in the {@link #pad(String, int, int)} method */
40 public static final int JUSTIFICATION_CENTER = 1;
41
42 /** Right-justification constant for use in the {@link #pad(String, int, int)} method */
43 public static final int JUSTIFICATION_RIGHT = 2;
44
/** Case-insensitive pattern matching a less-than sign immediately followed by "/script"
 * (the start of a closing script tag). Capture group 1 holds the "/script" text.
 * Used by {@link #escapeJavascript(String)} to rewrite the less-than sign as a unicode escape.
 * NOTE(review): this field is public and non-final, so it can be reassigned by any caller.
 */
45 public static Pattern scriptPattern = Pattern.compile("<(/script)", Pattern.CASE_INSENSITIVE);
46
47 /** Returns true if the supplied string is null or the empty string, false otherwise
48 *
49 * @param text The string to test
50 * @return true if the supplied string is null or the empty string, false otherwise
51 */
52 public static boolean isBlank(String text) {
53 return (text == null || text.equals(""));
54 }
55
56 /** Returns true if the supplied string is non-null and only contains numeric characters
57 *
58 * @param text The string to test
59 * @return true if the supplied string is non-null and only contains numeric characters
60 */
61 public static boolean isNumeric(String text) {
62 if (text == null) {
63 return false;
64 }
65 char ch;
66 for (int i = 0; i < text.length(); i++) {
67 ch = text.charAt(i);
68 if (ch < '0' || ch > '9') {
69 return false;
70 }
71 }
72 return true;
73 }
74
75 /** Returns true if the supplied string is non-null and only contains numeric characters
76 * or a single decimal point. The value can have a leading negative ('-') symbol.
77 *
78 * @param text The string to test
79 * @return true if the supplied string is non-null and only contains numeric characters,
80 * which may contain a '.' character in there somewhere.
81 */
82 public static boolean isNumericDecimal(String text) {
83 if (text == null) {
84 return false;
85 }
86 boolean seenPoint = false; // existential quandary there for you
87 char ch;
88 int len = text.length();
89 for (int i = 0; i < len; i++) {
90 ch = text.charAt(i);
91 if (ch=='.') {
92 if (seenPoint) { return false; }
93 seenPoint = true;
94 } else if (ch == '-' && i == 0) {
95 // leading negative symbol OK
96 if (len == 1) {
97 // but not if it's the only character in the string
98 return false;
99 }
100 } else if (ch < '0' || ch > '9') {
101 return false;
102 }
103 }
104 return true;
105 }
106
107 /** Returns true if the supplied string is non-null and only contains numeric characters
108 * or a single decimal point. The value can have a leading negative ('-') symbol.
109 *
110 * This version allows exponents ("E+nn" or "E-nn") to the end of the value.
111 *
112 * @param text The string to test
113 * @return true if the supplied string is non-null and only contains numeric characters,
114 * which may contain a '.' character in there somewhere.
115 */
116 public static boolean isNumericDecimalExp(String text) {
117 if (text == null) {
118 return false;
119 }
120 boolean seenPoint = false; // existential quandary there for you
121 int expPos = -1; // position of the 'E' character
122 char ch;
123 for (int i = 0; i < text.length(); i++) {
124 ch = text.charAt(i);
125 if (ch=='E') {
126 if (expPos != -1) { return false; }
127 expPos = i;
128 } else if (ch=='.' && expPos == -1) {
129 if (seenPoint) { return false; }
130 seenPoint = true;
131 } else if ((ch == '+' || ch == '-') && i == expPos + 1) {
132 // + or - directly after 'E' OK
133 } else if (ch == '-' && i == 0) {
134 // leading negative symbol OK
135 } else if (ch < '0' || ch > '9') {
136 return false;
137 }
138 }
139 return true;
140 }
141
142
143 /** Ensures that a string returned from a browser (on any platform) conforms
144 * to unix line-EOF conventions. Any instances of consecutive CRs (<code>0xD</code>)
145 * and LFs (<code>0xA</code>) in a string will be reduced to a series of CRs (the number of CRs will be the
146 * maximum number of CRs or LFs found in a row).
147 *
148 * @param input the input string
149 *
150 * @return the canonicalised string, as described above
151 */
152 public static String reduceNewlines(String input) {
153 StringBuilder sb = new StringBuilder();
154 int len = input.length();
155 int crCount = 0;
156 int lfCount = 0;
157 boolean insertNewline = false;
158 char ch;
159 for (int i=0; i<len; i++) {
160 ch = input.charAt(i);
161 if (ch == (char) 0xA) {
162 lfCount ++; insertNewline = true;
163 } else if (ch == (char) 0xD) {
164 crCount ++; insertNewline = true;
165 } else if (insertNewline) {
166 for (int j=0; j<Math.max(lfCount, crCount); j++) {
167 sb.append((char) 0xA);
168 }
169 insertNewline = false; lfCount=0; crCount=0;
170 sb.append(ch);
171 } else {
172 sb.append(ch);
173 }
174 }
175 if (insertNewline) {
176 for (int j=0; j<Math.max(lfCount, crCount); j++) {
177 sb.append((char) 0xA);
178 }
179 }
180
181 return sb.toString();
182 }
183
184
185 /**
186 * Returns the HTML-escaped form of a string. The <code>&</code>,
187 * <code><</code>, <code>></code>, and <code>"</code> characters are converted to
188 * <code>&amp;</code>, <code>&lt;</code>, <code>&gt;</code>, and
189 * <code>&quot;</code> respectively.
190 *
191 * <p>Characters in the unicode control code blocks ( apart from \t, \n and \r ) are converted to &xfffd;
192 * <p>Characters outside of the ASCII printable range are converted into &xnnnn; form
193 *
194 * @param string the string to convert
195 *
196 * @return the HTML-escaped form of the string
197 */
198 static public String escapeHtml(String string) {
199 if (string == null) {
200 return "";
201 }
202 char c;
203 String hex;
204 StringBuilder sb = new StringBuilder(string.length());
205 for (int i = 0; i < string.length(); i++) {
206 c = string.charAt(i);
207 // check for illegal characters
208 switch (c) {
209 case '&':
210 sb.append("&");
211 break;
212 case '<':
213 sb.append("<");
214 break;
215 case '>':
216 sb.append(">");
217 break;
218 case '\"':
219 // interestingly, "e; (with the e) works fine for HTML display,
220 // but not inside hidden field values
221 sb.append(""");
222 break;
223 default:
224 // 'illegal characters' according to ESAPI. 7f to 9f are control characters in unicode
225 if ( ( c <= 0x1f && c != '\t' && c != '\n' && c != '\r' ) || ( c >= 0x7f && c <= 0x9f ) ) {
226 sb.append("�"); // REPLACEMENT_HEX in ESAPI's HtmlEntityCodec
227 } else if ( c > 0x1f && c <= 0x7f ) {
228 // safe printable
229 sb.append(c);
230 } else {
231 // ESAPI didn't have the else block above, which was causing it escape everything
232 hex = getHexForNonAlphanumeric(c);
233 sb.append("&#x" + hex + ";");
234 }
235
236 }
237 }
238
239 return sb.toString();
240 }
241
242 /**
243 * Returns a regex-escaped form of a string. That is, the pattern
244 * returned by this method, if compiled into a regex, will match
245 * the supplied string exactly.
246 *
247 * @param string the string to convert
248 *
249 * @return the HTML-escaped form of the string
250 */
251 static public String escapeRegex(String string) {
252 if (string == null) {
253 return "";
254 }
255
256 char c;
257 StringBuilder sb = new StringBuilder(string.length());
258
259 for (int i = 0; i < string.length(); i++) {
260 c = string.charAt(i);
261
262 switch (c) {
263 case '.':
264 case '+': // intentional fall-through
265 case '?': // intentional fall-through
266 case '\\': // intentional fall-through
267 case '{': // intentional fall-through
268 case '}': // intentional fall-through
269 case '[': // intentional fall-through
270 case ']': // intentional fall-through
271 case '^': // intentional fall-through
272 case '$': // intentional fall-through
273 case '(': // intentional fall-through
274 case '|': // intentional fall-through
275 case ')': // intentional fall-through
276 sb.append("\\");
277 sb.append(c);
278 break;
279 default:
280 sb.append(c);
281 }
282 }
283
284 return sb.toString();
285 }
286
287
288 /**
289 * Returns the csv-escaped form of a string. A csv-escaped string is
290 * used when writing to a CSV (comma-separated-value) file. It ensures
291 * that commas included within a string are quoted. We use the Microsoft-Excel
292 * quoting rules, so that our CSV files can be imported into that. These rules
293 * (derived from experimentation) are:
294 *
295 * <ul>
296 * <li>Strings without commas (,) inverted commas ("), or newlines (\n) are returned as-is.
297 * <li>Otherwise, the string is surrounded by inverted commas, and any
298 * inverted commas within the string are doubled-up (i.e. '"' becomes '""').
299 * <li>A value that starts with any of "=", "@", "+" or "-" has a leading single apostrophe added
300 * to prevent the value being evaluated in Excel. The leading quote is visible to the user when the
301 * csv is opened, which may mean that it will have to be removed when roundtripping data.
302 * This may complicate things if the user actually wants a leading single quote in their CSV value.
303 * </ul>
304 *
305 * <p>Embedded newlines are inserted as-is, as per Excel. This will require
306 * some care whilst parsing if we want to be able to read these files.
307 *
308 * @param string the string to convert
309 *
310 * @return the csv-escaped form of the string
311 */
312 static public String escapeCsv(String string) {
313 if (string == null) {
314 return "";
315 }
316
317 boolean quoted = false;
318 // from https://www.contextis.com/en/blog/comma-separated-vulnerabilities
319 // prefix cells that start with ‘=’ , '@', '+' or '-' with an apostrophe
320 // This will ensure that the cell isn’t interpreted as a formula, and as a bonus in Microsoft Excel the apostrophe itself will not be displayed.
321 if (string.startsWith("=") ||
322 string.startsWith("@")) {
323 // prefix the string with an a single quote char to escape it
324 string = "'" + string;
325 quoted = true; // not sure need to quote here, but doesn't hurt
326 } else if ((string.startsWith("+") || string.startsWith("-")) &&
327 (string.length() == 1 || !Text.isNumericDecimalExp(string))) {
328 // numbers can legitimately start with '+' or '-' but anything else should be escaped
329 string = "'" + string;
330 quoted = true;
331 }
332
333
334 if (string.indexOf(',') == -1 && string.indexOf('"') == -1 && string.indexOf('\n') == -1 && !quoted) {
335 return string;
336 }
337 string = Text.replaceString(string, "\"", "\"\"");
338 string = "\"" + string + "\"";
339
340 return string;
341 }
342
343 /** Given a csv-encoded string (as produced by the rules in {@link #escapeCsv(String)},
344 * produces a List of Strings which represent the individual values in the string.
345 * Note that this method is *not* equivalent to calling <code>Arrays.asList(astring.split(","))</code>.
346 *
347 * <p>Setting the whitespaceSensitive parameter to false allows leading and trailing
348 * whitespace in *non-quoted* values to be removed, e.g. if the input string <code>text</code> is:
349 *
350 * <pre class="code">
351 * abc,def, ghi, j k ,"lmn"," op "," q,r","""hello""", "another"
352 * </pre>
353 *
354 * then <code>parseCsv(text, <b>false</b>)</code> will return the strings:
355 * <pre class="code">
356 * abc
357 * def
358 * ghi
359 * j k
360 * lmn
361 * op <i>(this String has one leading space, and a trailing space after 'p')</i>
362 * q,r <i>(this String has one leading space)</i>
363 * "hello"
364 * another
365 * </pre>
366 *
367 * and <code>parseCsv(text, <b>true</b>)</code> would throw a ParseException (since the
368 * final element is a quoted value, but begins with a space).
369 *
370 * If the <code>, "another"</code> text is removed, however, then
371 * <code>parseCsv(text, true)</code> would return the following:
372 *
373 * and <code>parseCsv(text, true)</code> will return the string
374 * <pre>
375 * abc
376 * def
377 * ghi <i>(this String has two leading spaces)</i>
378 * j k <i>(this String has one leading space and a trailing space after the 'k' character)</i>
379 * lmn
380 * op <i>(this String has one leading space, and a trailing space after 'p')</i>
381 * q,r <i>(this String has one leading space)</i>
382 * "hello"
383 * </pre>
384 *
385 * <p>Most applications would want to use the 'whiteSpaceSensitive=false' form of this function, since
386 * (a) less chance of a ParseException, and (b) it's what an end-user would normally
387 * expect. This can be performed by calling the {@link #parseCsv(String)} method.
388 *
389 * <p>Whitespace is determined by using the <code>Character.isSpaceChar()</code> method,
390 * which is Unicode-aware.
391 *
392 * @param text The CSV-encoded string to parse
393 * @param whitespaceSensitive If set to true, will trim leading and trailing whitespace in *non-quoted* values.
394 *
395 * @return a List of Strings. The returned List is guaranteed to always contain at least one element.
396 *
397 * @throws NullPointerException if the text passed to this method is null
398 * @throws ParseException if a quoted value contains leading whitespace before the
399 * opening quote, or after the trailing quote.
400 * @throws ParseException if a quoted value has a start quote, but no end quote, or
401 * if a value has additional text after a quoted value (before the next comma or EOL).
402 */
403 static public List<String> parseCsv(String text, boolean whitespaceSensitive)
404 throws ParseException {
405 if (text == null) {
406 throw new NullPointerException("null text");
407 }
408
409 // parse state:
410 // 0=searching for new value (at start of line or after comma)
411 // 1=consuming non-quoted values
412 // 2=consuming quoted value
413 // 3=consumed first quote within a quoted value (may be termining quote or a "" sequence)
414 // 4=consuming whitespace up to next comma/EOL (after quoted value, not whitespaceSensitive)
415 int parseState = 0;
416 int length = text.length();
417 String element;
418 List<String> result = new ArrayList<String>();
419 char ch;
420 StringBuilder buffer = new StringBuilder();
421
422 for (int pos = 0; pos < length; pos++) {
423 ch = text.charAt(pos);
424
425 // System.out.println("pos " + pos + ", state=" + parseState + ", nextchar=" + ch + ", buf=" + buffer);
426 switch (parseState) {
427 case 0:
428 if (Character.isSpaceChar(ch)) {
429 if (whitespaceSensitive) {
430 buffer.append(ch);
431 parseState = 1;
432 } else {
433 // ignore
434 }
435 } else if (ch == '"') {
436 parseState = 2;
437 } else if (ch == ',') {
438 result.add(""); // add an empty element; state remains unchanged
439 } else {
440 buffer.append(ch);
441 parseState = 1;
442 }
443 break;
444 case 1:
445 if (ch == ',') {
446 element = buffer.toString();
447 if (!whitespaceSensitive) {
448 element = element.trim();
449 }
450 result.add(element);
451 buffer.setLength(0);
452 parseState = 0;
453 } else {
454 buffer.append(ch);
455 }
456 break;
457 case 2:
458 if (ch == '"') {
459 parseState = 3;
460 } else {
461 buffer.append(ch);
462 }
463 break;
464 case 3:
465 if (ch == '"') {
466 buffer.append('"');
467 parseState = 2;
468 } else if (ch == ',') {
469 result.add(buffer.toString());
470 buffer.setLength(0);
471 parseState = 0;
472 } else if (Character.isSpaceChar(ch)) {
473 if (whitespaceSensitive) {
474 throw new ParseException("Cannot have trailing whitespace after close quote character", pos);
475 }
476 parseState = 4;
477 } else {
478 throw new ParseException("Cannot have trailing data after close quote character", pos);
479 }
480 break;
481 case 4:
482 if (Character.isSpaceChar(ch)) {
483 // consume and ignore
484 } else if (ch == ',') {
485 result.add(buffer.toString());
486 buffer.setLength(0);
487 parseState = 0;
488 } else {
489 throw new ParseException("Cannot have trailing data after close quote character", pos);
490 }
491 break;
492
493 default:
494 throw new IllegalStateException("Illegal state '" + parseState + "' in parseCsv");
495 }
496 }
497
498 // if state is 2, we are in the middle of a quoted value
499 if (parseState == 2) {
500 throw new ParseException("Missing endquote in csv text", length);
501 }
502
503 // otherwise we still need to add what's left in the buffer into the result list
504 element = buffer.toString();
505 if (parseState == 1 && !whitespaceSensitive) {
506 element = element.trim();
507 }
508 result.add(element);
509 return result;
510 }
511
512 @FunctionalInterface
513 public interface CsvLineReader { // doesn't extend Supplier<T> as it throws exceptions
514 /** Returns the next logical line in the CSV ( quoted values can contain newlines )
515 *
516 * @return
517 * @throws ParseException
518 * @throws IOException
519 */
520 List<String> readLine() throws ParseException, IOException;
521 }
522
523 // same as parseCsv(String, whitespaceSensitive) but can handle newlines in quotes by supplying a Reader
524 // the returned object will return a List<String> or null if EOF is reached
525 // ParseExceptions are wrapped in something, probably
526 static public CsvLineReader parseCsv(Reader r, boolean whitespaceSensitive) {
527 if (r == null) {
528 throw new NullPointerException("null reader");
529 }
530 return new CsvLineReader() {
531 // eof if we actually read eof or encouner a parse exception ( cannot recover )
532 boolean isAtStart = true; // for backwards compatibility with Text.parseCsv(""), first readLine() is never null
533 boolean isEOF = false;
534 @Override
535 public List<String> readLine() throws ParseException, IOException {
536 if (isEOF) { return null; }
537
538 // parse state:
539 // 0=searching for new value (at start of line or after comma)
540 // 1=consuming non-quoted values
541 // 2=consuming quoted value
542 // 3=consumed first quote within a quoted value (may be termining quote or a "" sequence)
543 // 4=consuming whitespace up to next comma/EOL (after quoted value, not whitespaceSensitive)
544 int parseState = 0;
545 // int length = text.length();
546 String element;
547 List<String> result = new ArrayList<String>();
548 char ch;
549 StringBuilder buffer = new StringBuilder();
550 int intChar = r.read();
551 int pos = 1;
552 if (intChar == -1 && !isAtStart) {
553 isEOF = true;
554 return null;
555 }
556
557 // @TODO better CRLF handling
558 isAtStart = false;
559 while (intChar != -1) {
560 ch = (char) intChar;
561
562 // System.out.println("pos " + pos + ", state=" + parseState + ", nextchar=" + ch + ", buf=" + buffer);
563 switch (parseState) {
564 case 0:
565 if (ch == '\n') {
566 // return result so far
567 element = buffer.toString();
568 result.add(buffer.toString());
569 return result;
570 } else if (Character.isSpaceChar(ch)) {
571 if (whitespaceSensitive) {
572 buffer.append(ch);
573 parseState = 1;
574 } else {
575 // ignore
576 }
577 } else if (ch == '"') {
578 parseState = 2;
579 } else if (ch == ',') {
580 result.add(""); // add an empty element; state remains unchanged
581 } else {
582 buffer.append(ch);
583 parseState = 1;
584 }
585 break;
586 case 1:
587 if (ch == '\n') {
588 // return result so far
589 element = buffer.toString();
590 if (!whitespaceSensitive) {
591 element = element.trim();
592 }
593 result.add(buffer.toString());
594 return result;
595 } else if (ch == ',') {
596 element = buffer.toString();
597 if (!whitespaceSensitive) {
598 element = element.trim();
599 }
600 result.add(element);
601 buffer.setLength(0);
602 parseState = 0;
603 } else {
604 buffer.append(ch);
605 }
606 break;
607 case 2:
608 if (ch == '"') {
609 parseState = 3;
610 } else {
611 buffer.append(ch);
612 }
613 break;
614 case 3:
615 if (ch == '\n') {
616 result.add(buffer.toString());
617 buffer.setLength(0);
618 parseState = 0;
619 return result;
620 } else if (ch == '"') {
621 buffer.append('"');
622 parseState = 2;
623 } else if (ch == ',') {
624 result.add(buffer.toString());
625 buffer.setLength(0);
626 parseState = 0;
627 } else if (Character.isSpaceChar(ch)) {
628 if (whitespaceSensitive) {
629 isEOF = true;
630 throw new ParseException("Cannot have trailing whitespace after close quote character", pos);
631 }
632 parseState = 4;
633 } else {
634 isEOF = true;
635 throw new ParseException("Cannot have trailing data after close quote character", pos);
636 }
637 break;
638 case 4:
639 if (ch == '\n') {
640 // return result so far
641 result.add(buffer.toString());
642 return result;
643 } else if (Character.isSpaceChar(ch)) {
644 // consume and ignore
645 } else if (ch == ',') {
646 result.add(buffer.toString());
647 buffer.setLength(0);
648 parseState = 0;
649 } else {
650 isEOF = true;
651 throw new ParseException("Cannot have trailing data after close quote character", pos);
652 }
653 break;
654
655 default:
656 throw new IllegalStateException("Illegal state '" + parseState + "' in parseCsv");
657 }
658
659 intChar = r.read();
660 pos++;
661 }
662 isEOF = true;
663
664 // if state is 2, we are in the middle of a quoted value
665 if (parseState == 2) {
666 throw new ParseException("Missing endquote in csv text", pos);
667 }
668
669 // otherwise we still need to add what's left in the buffer into the result list
670 element = buffer.toString();
671 if (parseState == 1 && !whitespaceSensitive) {
672 element = element.trim();
673 }
674 result.add(element);
675 return result;
676 }
677 };
678 }
679
680 /**
681 * Equivalent to <code>parseCsv(text, false);</code> (i.e. whitespace-insensitive parsing).
682 * Refer to the documentation for that method for more details.
683 *
684 * @see #parseCsv(String, boolean)
685 *
686 * @param text he CSV-encoded string to parse
687 *
688 * @return a List of Strings. The returned List is guaranteed to always contain at least one element.
689 *
690 * @throws NullPointerException if the text passed to this method is null.
691 * @throws ParseException see {@link #parseCsv(String, boolean)} for details.
692 */
693 static public List<String> parseCsv(String text)
694 throws ParseException {
695 return Text.parseCsv(text, false);
696 }
697
698 /** Returns a java-escaped string. Replaces '"' with '\"'.
699 *
700 * <p>Since this is predominantly used in the query builder, I am not worrying about
701 * unicode sequences (SWIFT is ASCII) or newlines (although this may be necessary later)
702 * for multiline textboxes
703 *
704 * @return The java-escaped version of the string
705 */
706 public static String escapeJava(String string) {
707 return Text.replaceString(string, "\"", "\\\"");
708 }
709
710 /** Returns a javascript string. The characters <code>'</code>,
711 * <code>"</code> and <code>\</code> are converted into their Unicode equivalents,
712 *
713 * <p>Non-printable characters are converted into unicode equivalents
714 **
715 * <p>Newlines are now replaced with "\n"
716 *
717 * @return The java-escaped version of the string
718 */
719 public static String escapeJavascript(String string) {
720 // backslashes are always escaped
721 //string = Text.replaceString(string, "\\", "\\u005C");
722 //string = Text.replaceString(string, "\"", "\\u0022");
723 //string = Text.replaceString(string, "'", "\\u0027");
724 //string = Text.replaceString(string, "\n", "\\n");
725 StringBuilder sb = new StringBuilder(string.length());
726 for (int i = 0; i<string.length(); i++) {
727 char ch = string.charAt(i);
728 if (ch=='\n') {
729 sb.append("\\n");
730 } else if (ch=='\\' || ch=='"' || ch=='\'' || ch<32 || ch>126) {
731 String hex = Integer.toString(ch, 16);
732 sb.append("\\u" + "0000".substring(0, 4-hex.length()) + hex);
733 } else {
734 sb.append(ch);
735 }
736 }
737 return scriptPattern.matcher(sb.toString()).replaceAll("\\\\u003C$1");
738 // return sb.toString();
739 }
740
741
742 /** Returns a javascript string. The characters <code>'</code>,
743 * <code>"</code> and <code>\</code> are converted into their Unicode equivalents,
744 *
745 * <p>Non-printable characters are converted into unicode equivalents
746 *
747 * @deprecated use {@link #escapeJavascript(String)} instead
748 *
749 * @return The java-escaped version of the string
750 */
751 public static String escapeJavascript2(String string) {
752 // this method only exists for backwards-compatability
753 string = reduceNewlines(string); // canonicalise CRLFs
754 return escapeJavascript(string);
755 }
756
757
758 /** Unescapes a java-escaped string. Replaces '\"' with '"',
759 * '\\u0022' with '"', '\\u0027' with ''', '\\u005C' with '\'.
760 *
761 * <p>Since this is predominantly used in the query builder, I am not worrying about
762 * unicode sequences (SWIFT is ASCII) or newlines (although this may be necessary later)
763 * for multiline textboxes
764 *
765 * @return The java-escaped version of the string
766 */
767 public static String unescapeJava(String string) {
768 string = Text.replaceString(string, "\\\"", "\"");
769 string = Text.replaceString(string, "\\u0022", "\"");
770 string = Text.replaceString(string, "\\u0027", "'");
771 string = Text.replaceString(string, "\\u005C", "\\");
772 return string;
773 }
774
775 /** Returns a python string, escaped so that it can be enclosed in a single-quoted string.
776 *
777 * <p>The characters <code>'</code>,
778 * <code>"</code> and <code>\</code> are converted into their Unicode equivalents,
779 *
780 * <p>Non-printable characters are converted into unicode equivalents
781 *
782 * @return The python-escaped version of the string
783 */
784 public static String escapePython(String string) {
785 // pretty much the same as Text.escapeJavascript2(), without the reduceNewLines, which probably shouldn't be there anyway
786 string = Text.replaceString(string, "\\", "\\u005C");
787 string = Text.replaceString(string, "\"", "\\u0022");
788 string = Text.replaceString(string, "'", "\\u0027");
789 string = Text.replaceString(string, "\n", "\\n");
790 StringBuilder sb = new StringBuilder(string.length());
791 for (int i = 0; i<string.length(); i++) {
792 char ch = string.charAt(i);
793 if (ch>=32 && ch<=126) {
794 sb.append(ch);
795 } else {
796 String hex = Integer.toString(ch, 16);
797 sb.append("\\u" + "0000".substring(0, 4-hex.length()) + hex);
798 }
799 }
800 return sb.toString();
801 // return string;
802 }
803
804 /** Escape a filename or path component.
805 * Characters that typically have special meanings in paths (":", "/", "\") are escaped with a preceding "\" character.
806 *
807 * Does not escape glob characters ( "*" or "?" ).
808 * Do not use this method to escape a full file path; when escaping a file path, escape each path component separately and then join
809 * the components with "/" characters ( see {@link #createEscapedPath(String[])} ).
810 *
811 * @param string the filename or path component to escape
812 *
813 * @return the escaped form of the filename (or path component)
814 */
815 // Does not escape DOS special filenames ( "NUL", "CON", "LPT1" etc ). Remember those ? Of course you do.
816 public static String escapePathComponent(String string) {
817 string = Text.replaceString(string, "\\", "\\\\");
818 string = Text.replaceString(string, "/", "\\/");
819 string = Text.replaceString(string, ":", "\\:");
820 return string;
821 }
822
823 /** Unescape a filename or path component.
824 * The escape sequences "\\" , "\:" and "\/" are converted to "\", ":" and "/" respectively.
825 * All other escape sequences will raise an IllegalArgumentException
826 *
827 * <p>See {@link #splitEscapedPath(String)} to split an escaped path into components.
828 *
829 * @param pathComponent the filename or path component to unescape
830 *
831 * @return the unescaped form of the filename or path component
832 *
833 * @throws IllegalArgumentException if an unexpected escape is encountered, or the escape is unclosed
834 */
835 public static String unescapePathComponent(String pathComponent) {
836 if (pathComponent == null) {
837 return null;
838 }
839 char c;
840 boolean inEscape = false;
841 StringBuilder sb = new StringBuilder(pathComponent.length());
842 for (int i = 0; i < pathComponent.length(); i++) {
843 c = pathComponent.charAt(i);
844 if (inEscape) {
845 switch (c) {
846 case '\\':
847 case '/': // intentional fall-through
848 case ':': // intentional fall-through
849 sb.append(c);
850 break;
851 default:
852 throw new IllegalArgumentException("Unexpected escape '\\" + c + "' in filename");
853 }
854 inEscape = false;
855 } else {
856 switch (c) {
857 case '\\':
858 inEscape = true;
859 break;
860 default:
861 sb.append(c);
862 }
863 }
864 }
865 if (inEscape) {
866 throw new IllegalArgumentException("Unclosed escape in filename");
867 }
868 return sb.toString();
869 }
870
871 // need to escape the \ in a regex ( \\ ) in a String ( \\\\ )
872 private static Pattern splitPathPattern = Pattern.compile("(?<!\\\\)/");
873
874 /** Split a path, but allow forward slashes in path components if they're escaped by a preceding '\' character.
875 * Individual path components returned by this method will be unescaped.
876 *
877 * <pre>
878 * splitPath(null) = NPE
879 * splitPath("") = [ "" ]
880 * splitPath("abc") = [ "abc" ]
881 * splitPath("abc/def/ghi") = [ "abc", "def", "ghi" ]
882 * splitPath("abc\\/def/ghi") = [ "abc/def", "ghi" ]
883 * </pre>
884 *
885 * <p>Opposite of {@link #createEscapedPath(String[])}
886 */
887 public static String[] splitEscapedPath(String escapedPath) {
888 String[] result = splitPathPattern.split(escapedPath);
889 for (int i=0; i<result.length; i++) {
890 result[i] = Text.unescapePathComponent(result[i]);
891 }
892 return result;
893 }
894
895 /** Escapes the components of a path String, returning an escaped full path String.
896 * Each path component is escaped with {@link #escapePathComponent(String)} and then joined using '/' characters.
897 *
898 * <p>Opposite of {@link #splitEscapedPath(String)}.
899 *
900 * @param pathComponents the filename components
901 * @return an escaped path
902 */
903 public static String createEscapedPath(String[] pathComponents) {
904 String result = null;
905 if (pathComponents.length == 0) {
906 throw new IllegalArgumentException("empty pathComponents");
907 }
908 for (String c : pathComponents) {
909 if (c==null) {
910 throw new NullPointerException("null pathComponent");
911 }
912 if (result == null) {
913 result = escapePathComponent(c);
914 } else {
915 result = result + "/" + escapePathComponent(c);
916 }
917 }
918 return result;
919 }
920
// escapeCss from ESAPI 2.0.1

/** Lookup table of hex strings, indexed by character value. Entries for the ASCII
 * alphanumerics ( 0-9, A-Z, a-z ) are null; all other entries below 0xFF hold the
 * character's value in lower-case hex. The entry for 0xFF is never populated, and is never
 * read ( see {@link #getHexForNonAlphanumeric(char)} ).
 */
private static final String[] esapi_hex = new String[256];
static {
    for (char c = 0; c < 0xFF; c++) {
        final boolean alphanumeric =
            (c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
        esapi_hex[c] = alphanumeric ? null : toHex(c).intern();
    }
}

/** Returns the value of a character in lower-case hex, without leading zeroes. */
private static String toHex(char c) {
    return Integer.toHexString(c);
}

/** Returns the hex value of a character, or null if it is an ASCII alphanumeric. */
private static String getHexForNonAlphanumeric(char c) {
    if (c < 0xFF) {
        return esapi_hex[c];
    }
    return toHex(c);
}

/** Returns the CSS-escaped form of a single character: alphanumerics as-is, anything else
 * as a backslash, the hex value and a terminating space.
 */
private static String encodeCssCharacter(Character c) {
    final String hex = getHexForNonAlphanumeric(c);
    if (hex == null) {
        return "" + c;
    }
    return "\\" + hex + " ";
}

/**
 * Returns the CSS-escaped form of a string.
 *
 * <p>The ASCII alphanumerics ( 0-9, A-Z, a-z ) are copied unchanged. Every other character
 * is converted to a backslash, the character's value in lower-case hex and a single
 * terminating space ( e.g. a space becomes <code>\20 </code> ).
 *
 * <p>A valid surrogate pair is escaped as a single supplementary code point
 * ( e.g. <code>\1f600 </code> ), because CSS parsers replace escapes that name an
 * individual surrogate with the replacement character.
 *
 * @param input the string to convert; may be null
 *
 * @return the CSS-escaped form of the string, or the empty string if {@code input} is null
 */
public static String escapeCss(String input) {
    if (input == null) {
        return "";
    }
    final StringBuilder sb = new StringBuilder();
    int i = 0;
    while (i < input.length()) {
        final int codePoint = input.codePointAt(i);
        i += Character.charCount(codePoint);
        if (Character.isSupplementaryCodePoint(codePoint)) {
            sb.append('\\').append(Integer.toHexString(codePoint)).append(' ');
        } else {
            // BMP characters ( including unpaired surrogates ) are handled per char
            sb.append(encodeCssCharacter((char) codePoint));
        }
    }
    return sb.toString();
}
963
964
965
966
/** Returns the given string, truncated to MAX_STRING_OUTPUT_CHARS (300) characters.
 * If it exceeds this length, a message is appended expressing how many
 * characters were truncated. Strings with the key of 'exception' are
 * not truncated (in order to display full stack traces when these occur).
 * Any keys that contain the text 'password', 'Password', 'credential' or
 * 'Credential' will be returned as eight asterisks.
 *
 * <p>This method is used in the debug JSP when dumping properties to the user,
 * in order to prevent inordinately verbose output.
 *
 * <p>Equivalent to calling {@link #getDisplayString(String, String, int)} with
 * a maxChars value of MAX_STRING_OUTPUT_CHARS.
 *
 * @param key The key of the string we wish to display
 * @param string The string value
 * @return A (possibly truncated) version of this string
 */
public static String getDisplayString(String key, String string) {
    return getDisplayString(key, string, MAX_STRING_OUTPUT_CHARS);
}
984
985 /** Returns the given string; but will truncate it to MAX_STRING_OUTPUT_CHARS.
986 * If it exceeds this length, a message is appended expressing how many
987 * characters were truncated. Strings with the key of 'exception' are
988 * not truncated (in order to display full stack traces when these occur).
989 * Any keys that contain the text 'password', 'Password', 'credential' or
990 * 'Credential' will be returned as eight asterisks.
991 *
992 * <p>This method is used in the debug JSP when dumping properties to the user,
993 * in order to prevent inordinately verbose output.
994 *
995 * @param key The key of the string we wish to display
996 * @param string The string value
997 * @param maxChars The maximum number of characters to display
998 *
999 * @return A (possibly truncated) version of this string
1000 */
1001 public static String getDisplayString(String key, String string, int maxChars) {
1002 if (string == null) {
1003 string = "(null)";
1004 }
1005
1006 if ("exception".equals(key)) {
1007 return string;
1008 }
1009
1010 if (key.indexOf("password") >= 0 || key.indexOf("Password") >= 0 || key.indexOf("credential") >= 0 || key.indexOf("Credential") >= 0) {
1011 return "********";
1012 }
1013
1014 if (string.length() <= maxChars) {
1015 return string;
1016 } else {
1017 return string.substring(0, maxChars) + "... (" + (string.length() - maxChars) + " more characters truncated)";
1018 }
1019 }
1020
1021 /** Utility function to return a default if the supplied string is null.
1022 * Shorthand for <code>(strText==null) ? strDefaultText : strText;</code>
1023 *
1024 * @return strText is strText is not null, otherwise strDefaultText
1025 */
1026 public static String strDefault(String strText, String strDefaultText) {
1027 return (strText == null) ? strDefaultText : strText;
1028 }
1029
/** Return a string composed of a series of strings, separated with the specified delimiter.
 * The elements are not quoted, and the same delimiter is used between every pair of elements.
 *
 * @param elements The array of elements to join
 * @param delimiter The delimiter to join each string with
 *
 * @return the joined string
 *
 * @throws NullPointerException if elements or delimiter is null
 */
public static String join(String[] elements, String delimiter) {
    return joinWithLast(elements, false, delimiter, delimiter);
}
1040
/** Return a string composed of a series of strings, separated with the specified delimiter.
 * The elements are not quoted, and the same delimiter is used between every pair of elements.
 *
 * @param elements A Collection or Iterable of the elements to join
 * @param delimiter The delimiter to join each string with
 *
 * @return the joined string
 *
 * @throws NullPointerException if elements or delimiter is null
 */
public static String join(Iterable<?> elements, String delimiter) {
    return joinWithLast(elements, false, delimiter, delimiter);
}
1051
1052 /** Return a string composed of a series of strings, separated with the specified delimiter.
1053 * Each element is contained in single quotes. The final delimeter can be set to a different
1054 * value, to produce text in the form <code>"'a', 'b' or 'c'"</code> or <code>"'a', 'b' and 'c'"</code>.
1055 *
1056 * <p>There is no special handling of values containing quotes; see {@link #escapeCsv(String)}
1057 *
1058 * @param elements The array of elements to join
1059 * @param isQuoted If true, each element is surrounded by single quotes
1060 * @param delimiter The delimiter to join each string with
1061 * @param lastDelimiter The delimiter to join the second-last and last elements
1062 *
1063 * @throws NullPointerException if elements or delimiter is null
1064 */
1065 public static String joinWithLast(String[] elements, boolean isQuoted, String delimiter, String lastDelimiter) {
1066 StringBuilder sb = new StringBuilder();
1067 if (elements == null) {
1068 throw new NullPointerException("null elements");
1069 }
1070 if (delimiter == null) {
1071 throw new NullPointerException("null delimiter");
1072 }
1073 if (lastDelimiter == null) {
1074 throw new NullPointerException("null lastDelimiter");
1075 }
1076 int len = elements.length;
1077 if (len == 0) {
1078 return "";
1079 }
1080
1081 for (int i = 0; i < len - 1; i++) {
1082 if (isQuoted) { sb.append("'"); }
1083 sb.append(elements[i]);
1084 if (isQuoted) { sb.append("'"); }
1085 if (i == len - 2) { sb.append(lastDelimiter); } else { sb.append(delimiter); }
1086 }
1087 if (isQuoted) { sb.append("'"); }
1088 sb.append(elements[len - 1]);
1089 if (isQuoted) { sb.append("'"); }
1090 return sb.toString();
1091 }
1092
1093 /** Return a string composed of a series of strings, separated with the specified delimiter
1094 *
1095 * <p>There is no special handling of values containing quotes; see {@link #escapeCsv(String)}
1096 *
1097 * @param elements A Collection or Iterable containing the elements to join
1098 * @param isQuoted If true, each element is surrounded by single quotes
1099 * @param delimiter The delimiter to join each string with
1100 * @param lastDelimiter The delimiter to join the second-last and last elements
1101 *
1102 * @throws NullPointerException if elements or delimiter is null
1103 *
1104 * @see #join(String[], String)
1105 */
1106 public static String joinWithLast(Iterable<?> elements, boolean isQuoted, String delimiter, String lastDelimiter) {
1107 StringBuilder sb = new StringBuilder();
1108 if (elements == null) {
1109 throw new NullPointerException("null elements");
1110 }
1111 if (delimiter == null) {
1112 throw new NullPointerException("null delimiter");
1113 }
1114 if (lastDelimiter == null) {
1115 throw new NullPointerException("null lastDelimiter");
1116 }
1117 Iterator<?> i = elements.iterator();
1118 if (!i.hasNext()) { return ""; }
1119
1120 Object thisEl = i.next();
1121 while (i.hasNext()) {
1122 Object nextEl = i.next();
1123 if (isQuoted) { sb.append("'"); }
1124 sb.append(thisEl);
1125 if (isQuoted) { sb.append("'"); }
1126 if (i.hasNext()) {
1127 sb.append(delimiter);
1128 } else {
1129 sb.append(lastDelimiter);
1130 }
1131 thisEl = nextEl;
1132 }
1133 if (isQuoted) { sb.append("'"); }
1134 sb.append(thisEl);
1135 if (isQuoted) { sb.append("'"); }
1136
1137 return sb.toString();
1138 }
1139
1140
1141
1142
1143 /*
1144 * efficient search & replace ... stolen from Usenet:
1145 * http://groups.google.co.uk/groups?hl=en&lr=&selm=memo.19990629182431.344B%40none.crap
1146 */
1147
1148 /**
1149 * An efficient search & replace routine. Replaces all instances of
1150 * searchString within str with replaceString.
1151 *
1152 * @param originalString The string to search
1153 * @param searchString The string to search for
1154 * @param replaceString The string to replace it with
1155 *
1156 */
1157 public static String replaceString(String originalString, String searchString, String replaceString) {
1158 if (replaceString == null) {
1159 return originalString;
1160 }
1161
1162 if (searchString == null) {
1163 return originalString;
1164 }
1165
1166 if (originalString == null) {
1167 return null;
1168 }
1169
1170 int loc = originalString.indexOf(searchString);
1171
1172 if (loc == -1) {
1173 return originalString;
1174 }
1175
1176 char[] src = originalString.toCharArray();
1177 int n = searchString.length();
1178 int m = originalString.length();
1179 StringBuilder buf = new StringBuilder(m + replaceString.length() - n);
1180 int start = 0;
1181
1182 do {
1183 if (loc > start) {
1184 buf.append(src, start, loc - start);
1185 }
1186
1187 buf.append(replaceString);
1188 start = loc + n;
1189 loc = originalString.indexOf(searchString, start);
1190 } while (loc > 0);
1191
1192 if (start < m) {
1193 buf.append(src, start, m - start);
1194 }
1195
1196 return buf.toString();
1197 }
1198
1199 /**
1200 * Reads a file, and returns its contents in a String
1201 *
1202 * @param filename The file to read
1203 *
1204 * @return The contents of the string,
1205 *
1206 * @throws IOException A problem occurred whilst attempting to read the string
1207 */
1208 public static String getFileContents(String filename)
1209 throws IOException {
1210 File file = new File(filename);
1211 FileInputStream fis = new FileInputStream(file);
1212 byte[] data = new byte[(int) file.length()];
1213 int len = fis.read(data);
1214 fis.close();
1215 if (len < file.length()) {
1216 /* this should never happen -- file has changed underneath us */
1217 throw new IOException("Buffer read != size of file");
1218 }
1219
1220 return new String(data);
1221 }
1222
/**
 * Reads a file, and returns its contents in a String. Identical to calling
 * <code>getFileContents(file.getCanonicalPath())</code>.
 *
 * @param file The file to read
 *
 * @return The contents of the file
 *
 * @throws IOException A problem occurred whilst resolving the canonical path
 *   of the file, or whilst attempting to read it
 */
public static String getFileContents(File file) throws IOException {
    return getFileContents(file.getCanonicalPath());
}
1237
1238
1239 /**
1240 * Prefixes every lines supplied with a given indent. e.g.
1241 * <code>indent("\t", "abcd\nefgh")</code> would return "\tabcd\n\tefgh". If the
1242 * string ends in a newline, then the return value also ends with a newline.
1243 *
1244 * @param indentString The characters to indent with. Usually spaces or tabs,
1245 * but could be something like a timestamp.
1246 * @param originalString The string to indent.
1247 * @return The originalString, with every line (as separated by the newline
1248 * character) prefixed with indentString.
1249 */
1250 static public String indent(String indentString, String originalString) {
1251 String allButLastChar;
1252 if (originalString == null || indentString == null) {
1253 throw new NullPointerException();
1254 }
1255 if (originalString.equals("")) {
1256 return indentString;
1257 }
1258 allButLastChar = originalString.substring(0, originalString.length() - 1);
1259 return indentString + replaceString(allButLastChar, "\n", "\n" + indentString) + originalString.substring(originalString.length() - 1);
1260 }
1261
1262 /** Ensure that a string is padded with spaces so that it meets the
1263 * required length. If the input string exceeds this length, this it
1264 * is returned unchanged
1265 *
1266 * @param inputString the string to pad
1267 * @param length the desired length
1268 * @param justification a JUSTIFICATION_* constant defining whether left or
1269 * right justification is required.
1270 *
1271 * @return a padded string.
1272 */
1273 static public String pad(String inputString, int length, int justification) {
1274 // @TODO not terribly efficient, but who cares
1275 switch (justification) {
1276 case JUSTIFICATION_LEFT:
1277 while (inputString.length() < length) {
1278 inputString = inputString + " ";
1279 }
1280 break;
1281
1282 case JUSTIFICATION_RIGHT:
1283 while (inputString.length() < length) {
1284 inputString = " " + inputString;
1285 }
1286 break;
1287
1288 case JUSTIFICATION_CENTER:
1289 while (inputString.length() < length) {
1290 inputString = inputString + " ";
1291 if (inputString.length() < length) {
1292 inputString = " " + inputString;
1293 }
1294 }
1295 break;
1296 }
1297 return inputString;
1298 }
1299
1300 /** Given a period-separated list of components (e.g. variable references ("a.b.c") or classnames),
1301 * returns the last component. For example,
1302 * getLastComponent("com.randomnoun.common.util.Text") will return "Text".
1303 *
1304 * <p>If component is null, this function returns null.
1305 * <p>If component contains no periods, this function returns the original string.
1306 *
1307 * @param string The string to retrieve the last component from
1308 */
1309 static public String getLastComponent(String string) {
1310 if (string == null) {
1311 return null;
1312 }
1313 if (string.indexOf('.') == -1) {
1314 return string;
1315 }
1316 return string.substring(string.lastIndexOf('.') + 1);
1317 }
1318
1319 /** Escape this supplied string so it can represent a 'name' or 'value' component
1320 * on a HTTP queryString. This generally involves escaping special characters into %xx
1321 * form. Note that this only works for US-ASCII data.
1322 *
1323 */
1324 public static String escapeQueryString(String unescapedQueryString) {
1325 // default encoding
1326 byte[] data = encodeUrl(allowed_within_query, unescapedQueryString.getBytes());
1327
1328 try {
1329 return new String(data, "US-ASCII");
1330 } catch (UnsupportedEncodingException e) {
1331 throw new RuntimeException("encodeQueryString() requires ASCII support");
1332 }
1333 }
1334
1335 /**
1336 * Encodes an array of bytes into an array of URL safe 7-bit
1337 * characters. Unsafe characters are escaped.
1338 *
1339 * @param urlsafe bitset of characters deemed URL safe
1340 * @param bytes array of bytes to convert to URL safe characters
1341 * @return array of bytes containing URL safe characters
1342 */
1343 private static final byte[] encodeUrl(BitSet urlsafe, byte[] bytes) {
1344 if (bytes == null) {
1345 return null;
1346 }
1347
1348 if (urlsafe == null) {
1349 throw new NullPointerException("null urlsafe");
1350 }
1351
1352 ByteArrayOutputStream buffer = new ByteArrayOutputStream();
1353
1354 for (int i = 0; i < bytes.length; i++) {
1355 int b = bytes[i];
1356
1357 if (b < 0) {
1358 b = 256 + b;
1359 }
1360
1361 if (urlsafe.get(b)) {
1362 if (b == ' ') {
1363 b = '+';
1364 }
1365
1366 buffer.write(b);
1367 } else {
1368 buffer.write('%');
1369
1370 char hex1 = Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, 16));
1371 char hex2 = Character.toUpperCase(Character.forDigit(b & 0xF, 16));
1372
1373 buffer.write(hex1);
1374 buffer.write(hex2);
1375 }
1376 }
1377
1378 return buffer.toByteArray();
1379 }
1380
/**
 * Encodes a string into Base64 format.
 * No blanks or line breaks are inserted.
 *
 * <p>The string is converted to bytes with <code>s.getBytes()</code> (i.e. the
 * platform's default charset) before being encoded.
 *
 * @param s a String to be encoded.
 * @return A String with the Base64 encoded data.
 */
public static String encodeBase64(String s) {
    return new String(encodeBase64(s.getBytes()));
}
1390
1391 /**
1392 * Encodes a byte array into Base64 format.
1393 * No blanks or line breaks are inserted.
1394 * @param in an array containing the data bytes to be encoded.
1395 * @return A character array with the Base64 encoded data.
1396 */
1397 public static char[] encodeBase64(byte[] in) {
1398 int iLen = in.length;
1399 int oDataLen = (iLen * 4 + 2) / 3; // output length without padding
1400 int oLen = ((iLen + 2) / 3) * 4; // output length including padding
1401 char[] out = new char[oLen];
1402 int ip = 0;
1403 int op = 0;
1404
1405 while (ip < iLen) {
1406 int i0 = in[ip++] & 0xff;
1407 int i1 = ip < iLen ? in[ip++] & 0xff : 0;
1408 int i2 = ip < iLen ? in[ip++] & 0xff : 0;
1409 int o0 = i0 >>> 2;
1410 int o1 = ((i0 & 3) << 4) | (i1 >>> 4);
1411 int o2 = ((i1 & 0xf) << 2) | (i2 >>> 6);
1412 int o3 = i2 & 0x3F;
1413 out[op++] = map1[o0];
1414 out[op++] = map1[o1];
1415 out[op] = op < oDataLen ? map1[o2] : '=';
1416 op++;
1417 out[op] = op < oDataLen ? map1[o3] : '=';
1418 op++;
1419 }
1420 return out;
1421 }
1422
1423 /** Used by {@link #parseData(String) to parse dates generated in Codec output.
1424 * (These dates are generated using the standard Java .toString() method, which
1425 * probably changes depending on the VM's locale, which I'm going to ignore for
1426 * the time being).
1427 */
1428 static class DateParser {
1429
1430 /** Parse a date generated by Date.toString() into a Date object
1431 *
1432 * @param dateString a string representation of a date
1433 * @return a Date representation of a date
1434 */
1435 public static Date valueOf(String dateString) {
1436 SimpleDateFormat sdf = new SimpleDateFormat("EEE MMM dd hh:mm:ss z yyyy");
1437 try {
1438 return sdf.parse(dateString);
1439 } catch (ParseException pe) {
1440 throw (IllegalArgumentException) new IllegalArgumentException("Invalid date '" + dateString + "'").initCause(pe);
1441 }
1442 }
1443 }
1444
1445 // ---------------------- Generous characters for each component validation
1446 // -- not much of this is used in this class, so I should shorten these definitions,
1447 // but you never know, I might use it later, so it's here for the time being.
1448 //
1449 // compiled from
1450 // org.apache.commons.httpclient.util.URIUtil
1451 // org.apache.commons.codec.net.URLCodec
1452 // org.apache.commons.httpclient.util.EncodingUtil
1453 // org.apache.commons.httpclient.URI
1454 //
1455 // trust me... just calling escapeQueryString() is *so* much easier.
// Character classes used when escaping URLs. Each BitSet is indexed by character
// value (0-255) and is populated by the static initializer below.
private static final BitSet percent = new BitSet(256); // escape % as %25
private static final BitSet digit = new BitSet(256); // 0-9
private static final BitSet alpha = new BitSet(256); // lowalpha | upalpha
private static final BitSet alphanum = new BitSet(256); // alpha | digit
private static final BitSet hex = new BitSet(256); // digit | a-f | A-F
private static final BitSet escaped = new BitSet(256); // "%" hex hex (here: percent | hex)
private static final BitSet mark = new BitSet(256); // -_.!~*'()
private static final BitSet unreserved = new BitSet(256); // alphanum | mark (URI allowed, no purpose)

private static final BitSet reserved = new BitSet(256); // ;/?:@&=+$,
private static final BitSet uric = new BitSet(256); // reserved | unreserved | escaped

private static final BitSet allowed_query = new BitSet(256); // uric - %
private static final BitSet allowed_within_query = new BitSet(256); // allowed_query - reserved

/** Mapping table from 6-bit values (0-63) to Base64 characters */
private static char[] map1 = new char[64];

// NB: www-form-encoding appears to be alpha | numeric | -_.* ( + space)
static {
    percent.set('%');

    for (int i = '0'; i <= '9'; i++) {
        digit.set(i);
    }

    for (int i = 'a'; i <= 'z'; i++) {
        alpha.set(i);
    }

    for (int i = 'A'; i <= 'Z'; i++) {
        alpha.set(i);
    }

    alphanum.or(alpha);
    alphanum.or(digit);
    hex.or(digit);

    for (int i = 'a'; i <= 'f'; i++) {
        hex.set(i);
    }

    for (int i = 'A'; i <= 'F'; i++) {
        hex.set(i);
    }

    escaped.or(percent);
    escaped.or(hex);
    mark.set('-');
    mark.set('_');
    mark.set('.');
    mark.set('!');
    mark.set('~');
    mark.set('*');
    mark.set('\'');
    mark.set('(');
    mark.set(')');
    reserved.set(';');
    reserved.set('/');
    reserved.set('?');
    reserved.set(':');
    reserved.set('@');
    reserved.set('&');
    reserved.set('=');
    reserved.set('+');
    reserved.set('$');
    reserved.set(',');
    unreserved.or(alphanum);
    unreserved.or(mark);
    uric.or(reserved);
    uric.or(unreserved);
    uric.or(escaped);
    // '%' must itself be escaped within a query, so it is removed from the allowed set
    allowed_query.or(uric);
    allowed_query.clear('%');
    // within a query name or value the reserved characters are excluded as well,
    // which leaves only the alphanumerics and the 'mark' characters
    allowed_within_query.or(allowed_query);
    allowed_within_query.andNot(reserved);


    // create map1 array: A-Z, a-z, 0-9, '+', '/' (the standard Base64 alphabet)
    int i = 0;
    for (char c = 'A'; c <= 'Z'; c++) {
        map1[i++] = c;
    }
    for (char c = 'a'; c <= 'z'; c++) {
        map1[i++] = c;
    }
    for (char c = '0'; c <= '9'; c++) {
        map1[i++] = c;
    }
    map1[i++] = '+';
    map1[i++] = '/';

}
1552
1553
1554
/**
 * Returns a comparator that compares contained numbers based on their numeric values and compares other parts
 * using the current locale's order rules.
 *
 * <p>The non-numeric parts are compared with the {@link Collator} for the default locale
 * (<code>Collator.getInstance()</code>), which is obtained once, when this method is called; so, for example,
 * under a German default locale the ordering of umlauts is whatever that locale's collator defines.</p>
 *
 * @return A string comparator that uses the current locale's order rules and handles embedded numbers
 *         correctly.
 */
public static Comparator<String> getNaturalComparator() {
    final Collator collator = Collator.getInstance();
    return new Comparator<String>() {
        public int compare(String o1, String o2) {
            return compareNatural(collator, o1, o2);
        }
    };
}
1572
/**
 * <p>Compares two strings using the rules of the supplied collator, and comparing contained numbers based on
 * their numeric values.</p>
 * <p>This is probably the best default comparison to use.</p>
 * <p>If you know that the texts to be compared are in a certain language that differs from the default locale's
 * language, then get a collator for the desired locale ({@link java.text.Collator#getInstance(java.util.Locale)})
 * and pass it to this method.</p>
 *
 * @param collator used to compare the parts of the strings that aren't numbers
 * @param s first string
 * @param t second string
 * @return zero if <code>s</code> and <code>t</code> compare as equal,
 *         a value less than zero if <code>s</code> precedes <code>t</code>
 *         and a value larger than zero if <code>s</code> follows <code>t</code>
 */
public static int compareNatural(Collator collator, String s, String t) {
    return compareNatural(s, t, false, collator);
}
1590
1591
/** Natural compare operation. Stolen from
 * http://www.eekboom.com/java/compareNatural/src/com/eekboom/utils/Strings.java
 * (source file is under BSD license).
 *
 * <p>Both strings are processed as alternating runs of digits ("number subwords") and
 * non-digits ("word subwords"). Number subwords are compared by numeric value, ignoring
 * leading zeros; word subwords are compared with the collator if one is supplied,
 * otherwise character by character.
 *
 * @param s first string
 * @param t second string
 * @param caseSensitive if false, characters that differ only in case are treated as equal;
 *         if true, they are compared by their character values - will be ignored if a collator is given
 * @param collator used to compare subwords that aren't numbers - if null, characters will be compared
 *         individually based on their Unicode value
 * @return zero if <code>s</code> and <code>t</code> compare as equal,
 *         a value less than zero if <code>s</code> precedes <code>t</code>
 *         and a value larger than zero if <code>s</code> follows <code>t</code>
 */
private static int compareNatural(String s, String t, boolean caseSensitive, Collator collator) {
    int sIndex = 0;
    int tIndex = 0;

    int sLength = s.length();
    int tLength = t.length();

    while(true) {
        // both character indices are after a subword (or at zero)

        // Check if one string is at end
        if(sIndex == sLength && tIndex == tLength) {
            return 0;
        }
        if(sIndex == sLength) {
            return -1;
        }
        if(tIndex == tLength) {
            return 1;
        }

        // Compare sub word
        char sChar = s.charAt(sIndex);
        char tChar = t.charAt(tIndex);

        boolean sCharIsDigit = Character.isDigit(sChar);
        boolean tCharIsDigit = Character.isDigit(tChar);

        if(sCharIsDigit && tCharIsDigit) {
            // Compare numbers

            // skip leading 0s
            int sLeadingZeroCount = 0;
            while(sChar == '0') {
                ++sLeadingZeroCount;
                ++sIndex;
                if(sIndex == sLength) {
                    break;
                }
                sChar = s.charAt(sIndex);
            }
            int tLeadingZeroCount = 0;
            while(tChar == '0') {
                ++tLeadingZeroCount;
                ++tIndex;
                if(tIndex == tLength) {
                    break;
                }
                tChar = t.charAt(tIndex);
            }
            // a number subword that consisted only of zeros has now been consumed completely
            boolean sAllZero = sIndex == sLength || !Character.isDigit(sChar);
            boolean tAllZero = tIndex == tLength || !Character.isDigit(tChar);
            if(sAllZero && tAllZero) {
                continue;
            }
            if(sAllZero && !tAllZero) {
                return -1;
            }
            if(tAllZero) {
                return 1;
            }

            // walk both numbers digit by digit, remembering the first difference found;
            // that difference only decides the result if both numbers have the same
            // number of digits, otherwise the number with more digits is the larger
            int diff = 0;
            do {
                if(diff == 0) {
                    diff = sChar - tChar;
                }
                ++sIndex;
                ++tIndex;
                if(sIndex == sLength && tIndex == tLength) {
                    // identical numbers at the end of both strings: order by leading zero count
                    return diff != 0 ? diff : sLeadingZeroCount - tLeadingZeroCount;
                }
                if(sIndex == sLength) {
                    if(diff == 0) {
                        return -1;
                    }
                    return Character.isDigit(t.charAt(tIndex)) ? -1 : diff;
                }
                if(tIndex == tLength) {
                    if(diff == 0) {
                        return 1;
                    }
                    return Character.isDigit(s.charAt(sIndex)) ? 1 : diff;
                }
                sChar = s.charAt(sIndex);
                tChar = t.charAt(tIndex);
                sCharIsDigit = Character.isDigit(sChar);
                tCharIsDigit = Character.isDigit(tChar);
                if(!sCharIsDigit && !tCharIsDigit) {
                    // both number sub words have the same length
                    if(diff != 0) {
                        return diff;
                    }
                    break;
                }
                if(!sCharIsDigit) {
                    // s has fewer digits, so it is the smaller number
                    return -1;
                }
                if(!tCharIsDigit) {
                    // t has fewer digits, so it is the smaller number
                    return 1;
                }
            } while(true);
        }
        else {
            // Compare words
            if(collator != null) {
                // To use the collator the whole subwords have to be compared - character-by-character comparison
                // is not possible. So find the two subwords first
                int aw = sIndex;
                int bw = tIndex;
                do {
                    ++sIndex;
                } while(sIndex < sLength && !Character.isDigit(s.charAt(sIndex)));
                do {
                    ++tIndex;
                } while(tIndex < tLength && !Character.isDigit(t.charAt(tIndex)));

                String as = s.substring(aw, sIndex);
                String bs = t.substring(bw, tIndex);
                int subwordResult = collator.compare(as, bs);
                if(subwordResult != 0) {
                    return subwordResult;
                }
            }
            else {
                // No collator specified. All characters should be ascii only. Compare character-by-character.
                do {
                    if(sChar != tChar) {
                        if(caseSensitive) {
                            return sChar - tChar;
                        }
                        // case-insensitive: the characters are only different if they
                        // differ in both their uppercase and their lowercase forms
                        sChar = Character.toUpperCase(sChar);
                        tChar = Character.toUpperCase(tChar);
                        if(sChar != tChar) {
                            sChar = Character.toLowerCase(sChar);
                            tChar = Character.toLowerCase(tChar);
                            if(sChar != tChar) {
                                return sChar - tChar;
                            }
                        }
                    }
                    ++sIndex;
                    ++tIndex;
                    if(sIndex == sLength && tIndex == tLength) {
                        return 0;
                    }
                    if(sIndex == sLength) {
                        return -1;
                    }
                    if(tIndex == tLength) {
                        return 1;
                    }
                    sChar = s.charAt(sIndex);
                    tChar = t.charAt(tIndex);
                    sCharIsDigit = Character.isDigit(sChar);
                    tCharIsDigit = Character.isDigit(tChar);
                } while(!sCharIsDigit && !tCharIsDigit);
            }
        }
    }
}
1766
1767
1768 // taken from the W3C Jigsaw server sourcecode; class org.w3c.jigsaw.http.Request#unescape(String)
1769 /**
1770 * Unescape a HTTP escaped string
1771 * @param s The string to be unescaped
1772 * @return the unescaped string.
1773 */
1774 public static String unescapeQueryString (String s) {
1775 StringBuilder sbuf = new StringBuilder() ;
1776 int len = s.length() ;
1777 int ch = -1 ;
1778 for (int i = 0 ; i < len ; i++) {
1779 switch (ch = s.charAt(i)) {
1780 case '%':
1781 if (i < len - 2) {
1782 // @TODO check to see how illegal escapes are treated
1783 // e.g. "%nothex"
1784 ch = s.charAt (++i) ;
1785 int hb = (Character.isDigit ((char) ch)
1786 ? ch - '0'
1787 : 10+Character.toLowerCase ((char) ch)-'a') & 0xF ;
1788 ch = s.charAt (++i) ;
1789 int lb = (Character.isDigit ((char) ch)
1790 ? ch - '0'
1791 : 10+Character.toLowerCase ((char) ch)-'a') & 0xF ;
1792 sbuf.append ((char) ((hb << 4) | lb)) ;
1793 } else {
1794 sbuf.append ('%'); // hit EOL, just leave as is
1795 }
1796 break ;
1797 case '+':
1798 sbuf.append (' ') ;
1799 break ;
1800 default:
1801 sbuf.append ((char) ch) ;
1802 }
1803 }
1804 return sbuf.toString() ;
1805 }
1806
1807 /** Returns the largest common prefix between two other strings; e.g.
1808 * getCommonPrefix("abcsomething", "abcsometharg") would be "abcsometh".
1809 *
1810 * @param string1 String number one
1811 * @param string2 String number two
1812 *
1813 * @return the large common prefix between the two strings
1814 *
1815 * @throws NullPointerException is string1 or string2 is null
1816 */
1817 public static String getCommonPrefix(String string1, String string2) {
1818 if (string1==null) { throw new NullPointerException("null string1"); }
1819 if (string2==null) { throw new NullPointerException("null string2"); }
1820 int c = 0;
1821 int maxLen = Math.min(string1.length(), string2.length());
1822
1823 while (c < maxLen && string1.charAt(c)==string2.charAt(c)) {
1824 c++;
1825 }
1826 return string1.substring(0, c);
1827 }
1828
1829 /** Uppercases the first character of a string.
1830 *
1831 * @param text text to modify
1832 *
1833 * @return the supplied text, with the first character converted to uppercase.
1834 */
1835 static public String toFirstUpper(String text) {
1836 return Character.toUpperCase(text.charAt(0)) + text.substring(1);
1837 }
1838
1839
1840 /** Lowercases the first character of a string.
1841 *
1842 * @param text text to modify
1843 *
1844 * @return the supplied text, with the first character converted to lowercase.
1845 */
1846 static public String toFirstLower(String text) {
1847 return Character.toLowerCase(text.charAt(0)) + text.substring(1);
1848 }
1849
1850
1851
1852
1853 /** Number of character edits between two strings; taken from
1854 * http://www.merriampark.com/ldjava.htm. There's a version in commongs-lang,
1855 * apparently, but according to the comments on that page, it uses O(n^2) memory,
1856 * which can't be good.
1857 *
1858 * @param s string 1
1859 * @param t string 2
1860 *
1861 * @return the smallest number of edits required to convert s into t
1862 */
1863 public static int getLevenshteinDistance (String s, String t) {
1864 if (s == null || t == null) {
1865 throw new IllegalArgumentException("Strings must not be null");
1866 }
1867
1868 /*
1869 The difference between this impl. and the previous is that, rather
1870 than creating and retaining a matrix of size s.length()+1 by t.length()+1,
1871 we maintain two single-dimensional arrays of length s.length()+1. The first, d,
1872 is the 'current working' distance array that maintains the newest distance cost
1873 counts as we iterate through the characters of String s. Each time we increment
1874 the index of String t we are comparing, d is copied to p, the second int[]. Doing so
1875 allows us to retain the previous cost counts as required by the algorithm (taking
1876 the minimum of the cost count to the left, up one, and diagonally up and to the left
1877 of the current cost count being calculated). (Note that the arrays aren't really
1878 copied anymore, just switched...this is clearly much better than cloning an array
1879 or doing a System.arraycopy() each time through the outer loop.)
1880
1881 Effectively, the difference between the two implementations is this one does not
1882 cause an out of memory condition when calculating the LD over two very large strings.
1883 */
1884
1885 int n = s.length(); // length of s
1886 int m = t.length(); // length of t
1887
1888 if (n == 0) {
1889 return m;
1890 } else if (m == 0) {
1891 return n;
1892 }
1893
1894 int p[] = new int[n+1]; //'previous' cost array, horizontally
1895 int d[] = new int[n+1]; // cost array, horizontally
1896 int _d[]; //placeholder to assist in swapping p and d
1897
1898 // indexes into strings s and t
1899 int i; // iterates through s
1900 int j; // iterates through t
1901
1902 char t_j; // jth character of t
1903
1904 int cost; // cost
1905
1906 for (i = 0; i<=n; i++) {
1907 p[i] = i;
1908 }
1909
1910 for (j = 1; j<=m; j++) {
1911 t_j = t.charAt(j-1);
1912 d[0] = j;
1913
1914 for (i=1; i<=n; i++) {
1915 cost = s.charAt(i-1)==t_j ? 0 : 1;
1916 // minimum of cell to the left+1, to the top+1, diagonally left and up +cost
1917 d[i] = Math.min(Math.min(d[i-1]+1, p[i]+1), p[i-1]+cost);
1918 }
1919
1920 // copy current distance counts to 'previous row' distance counts
1921 _d = p;
1922 p = d;
1923 d = _d;
1924 }
1925
1926 // our last action in the above loop was to switch d and p, so p now
1927 // actually has the most recent cost counts
1928 return p[n];
1929 }
1930
1931 /** Return the md5 hash of a string
1932 *
1933 * @param text text to hash
1934 *
1935 * @return a hex-encoded version of the MD5 hash
1936 *
1937 * @throws IllegalStateException if the java installation in use doesn't know
1938 * about MD5
1939 */
1940 static public String getMD5(String text) {
1941 try{
1942 MessageDigest algorithm = MessageDigest.getInstance("MD5");
1943 algorithm.reset();
1944 // algorithm.update(defaultBytes);
1945 algorithm.update(text.getBytes());
1946 byte messageDigest[] = algorithm.digest();
1947
1948 StringBuilder hexString = new StringBuilder();
1949 for (int i=0;i<messageDigest.length;i++) {
1950 hexString.append(Integer.toHexString(0xFF & messageDigest[i]));
1951 }
1952 return hexString.toString();
1953 } catch (NoSuchAlgorithmException nsae) {
1954 throw (IllegalStateException) new IllegalStateException("Unknown algorithm 'MD5'").initCause(nsae);
1955 }
1956 }
1957
1958 /** Returns a string composed of the supplied text, repeated 0 or more times
1959 *
1960 * @param text text to repeat
1961 * @param count number of repetitions
1962 *
1963 * @return the repeated text
1964 */
1965 static public String repeat(String text, int count) {
1966 StringBuffer sb = new StringBuffer();
1967 for (int i=0; i<count; i++) {
1968 sb.append(text);
1969 }
1970 return sb.toString();
1971 }
1972
1973
1974 /** Perform ${xxxx}-style substitution of placeholders in text. Placeholders without
1975 * values will be left as-is.
1976 *
1977 * <p>For example, gives the set of variables:
1978 * <ul>
1979 * <li>abc = def
1980 * </ul>
1981 *
1982 * <p>then the result of <code>substituteParameters("xxxx${abc}yyyy${def}zzzz")</code>
1983 * will be "xxxxdefyyyy${def}zzzz"
1984 *
1985 * <p><code>$</code> followed by any other character will be left as-is.
1986 *
1987 * @param variables a set of variable names and values, used in the substitution
1988 * @param text the text to be substituted.
1989 *
1990 * @return text, with placeholders replaced with values in the variables parameter
1991 */
1992 public static String substitutePlaceholders(Map<?, ?> variables, String text) {
1993 // escaped version of (\$\{.*?\}|[^$]+|\$.)
1994 Pattern p = Pattern.compile("(\\$\\{.*?\\}|[^$]+|\\$)"); // modified regex
1995 Matcher m = p.matcher(text);
1996 String result = "";
1997 while (m.find()) {
1998 String token = m.group(1);
1999 if (token.startsWith("${") && token.endsWith("}")) {
2000 Object value = variables.get(token.substring(2, token.length()-1));
2001 if (value == null) {
2002 result = result + token;
2003 } else {
2004 result = result + value.toString();
2005 }
2006 } else {
2007 result = result + token;
2008 }
2009 }
2010 return result;
2011 }
2012
2013
2014 }