/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.commons.text; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.ListIterator; import java.util.NoSuchElementException; import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.StringUtils; /** * Tokenizes a string based on delimiters (separators) * and supporting quoting and ignored character concepts. *
* This class can split a String into many smaller strings. It aims * to do a similar job to {@link java.util.StringTokenizer StringTokenizer}, * however it offers much more control and flexibility including implementing * the {@code ListIterator} interface. By default, it is set up * like {@code StringTokenizer}. *
* The input String is split into a number of tokens. * Each token is separated from the next String by a delimiter. * One or more delimiter characters must be specified. *
* Each token may be surrounded by quotes. * The quote matcher specifies the quote character(s). * A quote may be escaped within a quoted section by duplicating itself. *
* Between each token and the delimiter are potentially characters that need trimming. * The trimmer matcher specifies these characters. * One usage might be to trim whitespace characters. *
* At any point outside the quotes there might potentially be invalid characters. * The ignored matcher specifies these characters to be removed. * One usage might be to remove new line characters. *
* Empty tokens may be removed or returned as null. *
* "a,b,c" - Three tokens "a","b","c" (comma delimiter) * " a, b , c " - Three tokens "a","b","c" (default CSV processing trims whitespace) * "a, ", b ,", c" - Three tokens "a, " , " b ", ", c" (quoted text untouched) ** *
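 * <table>
 *  <caption>StrTokenizer properties and options</caption>
 *  <tr><th>Property</th><th>Type</th><th>Default</th></tr>
 *  <tr><td>delim</td><td>CharSetMatcher</td><td>{ \t\n\r\f}</td></tr>
 *  <tr><td>quote</td><td>NoneMatcher</td><td>{}</td></tr>
 *  <tr><td>ignore</td><td>NoneMatcher</td><td>{}</td></tr>
 *  <tr><td>emptyTokenAsNull</td><td>boolean</td><td>false</td></tr>
 *  <tr><td>ignoreEmptyTokens</td><td>boolean</td><td>true</td></tr>
 * </table>
 *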
* You must call a "reset" method to set the string which you want to parse. *
* @return a new tokenizer instance which parses Comma Separated Value strings */ public static StrTokenizer getCSVInstance() { return getCSVClone(); } /** * Gets a new tokenizer instance which parses Comma Separated Value strings * initializing it with the given input. The default for CSV processing * will be trim whitespace from both ends (which can be overridden with * the setTrimmer method). * * @param input the text to parse * @return a new tokenizer instance which parses Comma Separated Value strings */ public static StrTokenizer getCSVInstance(final char[] input) { final StrTokenizer tok = getCSVClone(); tok.reset(input); return tok; } /** * Gets a new tokenizer instance which parses Comma Separated Value strings * initializing it with the given input. The default for CSV processing * will be trim whitespace from both ends (which can be overridden with * the setTrimmer method). * * @param input the text to parse * @return a new tokenizer instance which parses Comma Separated Value strings */ public static StrTokenizer getCSVInstance(final String input) { final StrTokenizer tok = getCSVClone(); tok.reset(input); return tok; } /** * Returns a clone of {@code TSV_TOKENIZER_PROTOTYPE}. * * @return a clone of {@code TSV_TOKENIZER_PROTOTYPE}. */ private static StrTokenizer getTSVClone() { return (StrTokenizer) TSV_TOKENIZER_PROTOTYPE.clone(); } /** * Gets a new tokenizer instance which parses Tab Separated Value strings. * The default for CSV processing will be trim whitespace from both ends * (which can be overridden with the setTrimmer method). ** You must call a "reset" method to set the string which you want to parse. *
* @return a new tokenizer instance which parses Tab Separated Value strings. */ public static StrTokenizer getTSVInstance() { return getTSVClone(); } /** * Gets a new tokenizer instance which parses Tab Separated Value strings. * The default for CSV processing will be trim whitespace from both ends * (which can be overridden with the setTrimmer method). * @param input the string to parse * @return a new tokenizer instance which parses Tab Separated Value strings. */ public static StrTokenizer getTSVInstance(final char[] input) { final StrTokenizer tok = getTSVClone(); tok.reset(input); return tok; } /** * Gets a new tokenizer instance which parses Tab Separated Value strings. * The default for CSV processing will be trim whitespace from both ends * (which can be overridden with the setTrimmer method). * @param input the string to parse * @return a new tokenizer instance which parses Tab Separated Value strings. */ public static StrTokenizer getTSVInstance(final String input) { final StrTokenizer tok = getTSVClone(); tok.reset(input); return tok; } /** The text to work on. */ private char[] chars; /** The parsed tokens. */ private String[] tokens; /** The current iteration position. */ private int tokenPos; /** The delimiter matcher. */ private StrMatcher delimMatcher = StrMatcher.splitMatcher(); /** The quote matcher. */ private StrMatcher quoteMatcher = StrMatcher.noneMatcher(); /** The ignored matcher. */ private StrMatcher ignoredMatcher = StrMatcher.noneMatcher(); /** The trimmer matcher. */ private StrMatcher trimmerMatcher = StrMatcher.noneMatcher(); /** Whether to return empty tokens as null. */ private boolean emptyAsNull; /** Whether to ignore empty tokens. */ private boolean ignoreEmptyTokens = true; /** * Constructs a tokenizer splitting on space, tab, newline and form feed * as per StringTokenizer, but with no text to tokenize. ** This constructor is normally used with {@link #reset(String)}. *
*/ public StrTokenizer() { this.chars = null; } /** * Constructs a tokenizer splitting on space, tab, newline and form feed * as per StringTokenizer. * * @param input the string which is to be parsed, not cloned */ public StrTokenizer(final char[] input) { if (input == null) { this.chars = null; } else { this.chars = input.clone(); } } /** * Constructs a tokenizer splitting on the specified character. * * @param input the string which is to be parsed, not cloned * @param delim the field delimiter character */ public StrTokenizer(final char[] input, final char delim) { this(input); setDelimiterChar(delim); } /** * Constructs a tokenizer splitting on the specified delimiter character * and handling quotes using the specified quote character. * * @param input the string which is to be parsed, not cloned * @param delim the field delimiter character * @param quote the field quoted string character */ public StrTokenizer(final char[] input, final char delim, final char quote) { this(input, delim); setQuoteChar(quote); } /** * Constructs a tokenizer splitting on the specified string. * * @param input the string which is to be parsed, not cloned * @param delim the field delimiter string */ public StrTokenizer(final char[] input, final String delim) { this(input); setDelimiterString(delim); } /** * Constructs a tokenizer splitting using the specified delimiter matcher. * * @param input the string which is to be parsed, not cloned * @param delim the field delimiter matcher */ public StrTokenizer(final char[] input, final StrMatcher delim) { this(input); setDelimiterMatcher(delim); } /** * Constructs a tokenizer splitting using the specified delimiter matcher * and handling quotes using the specified quote matcher. 
* * @param input the string which is to be parsed, not cloned * @param delim the field delimiter character * @param quote the field quoted string character */ public StrTokenizer(final char[] input, final StrMatcher delim, final StrMatcher quote) { this(input, delim); setQuoteMatcher(quote); } /** * Constructs a tokenizer splitting on space, tab, newline and form feed * as per StringTokenizer. * * @param input the string which is to be parsed */ public StrTokenizer(final String input) { if (input != null) { chars = input.toCharArray(); } else { chars = null; } } /** * Constructs a tokenizer splitting on the specified delimiter character. * * @param input the string which is to be parsed * @param delim the field delimiter character */ public StrTokenizer(final String input, final char delim) { this(input); setDelimiterChar(delim); } /** * Constructs a tokenizer splitting on the specified delimiter character * and handling quotes using the specified quote character. * * @param input the string which is to be parsed * @param delim the field delimiter character * @param quote the field quoted string character */ public StrTokenizer(final String input, final char delim, final char quote) { this(input, delim); setQuoteChar(quote); } /** * Constructs a tokenizer splitting on the specified delimiter string. * * @param input the string which is to be parsed * @param delim the field delimiter string */ public StrTokenizer(final String input, final String delim) { this(input); setDelimiterString(delim); } /** * Constructs a tokenizer splitting using the specified delimiter matcher. * * @param input the string which is to be parsed * @param delim the field delimiter matcher */ public StrTokenizer(final String input, final StrMatcher delim) { this(input); setDelimiterMatcher(delim); } /** * Constructs a tokenizer splitting using the specified delimiter matcher * and handling quotes using the specified quote matcher. 
* * @param input the string which is to be parsed * @param delim the field delimiter matcher * @param quote the field quoted string matcher */ public StrTokenizer(final String input, final StrMatcher delim, final StrMatcher quote) { this(input, delim); setQuoteMatcher(quote); } /** * Unsupported ListIterator operation; tokens cannot be added through this iterator. * * @param obj this parameter is ignored * @throws UnsupportedOperationException always */ @Override public void add(final String obj) { throw new UnsupportedOperationException("add() is unsupported"); } /** * Adds a token to a list, paying attention to the parameters we've set. * NOTE(review): the rest of this method's signature and its body are not present in this copy of the file; confirm against the upstream source before relying on this description. * * @param list the list to add to * @param tok the token to add */ private void addToken(final List* These characters are ignored when parsing the String, unless they are * within a quoted region. * The default value is not to ignore anything. *
*
 * @return The ignored matcher in use
 */
public StrMatcher getIgnoredMatcher() {
    return this.ignoredMatcher;
}

/**
 * Gets the quote matcher currently in use.
 * <p>
 * The quote character is used to wrap data between the tokens.
 * This enables delimiters to be entered as data.
 * Unless a quote matcher has been set, this is the initial
 * {@code StrMatcher.noneMatcher()}; a double quote is not configured
 * by default.
 * </p>
* * @return The quote matcher in use */ public StrMatcher getQuoteMatcher() { return quoteMatcher; } /** * Gets a copy of the full token list as an independent modifiable array. * The returned array is a clone of the internal token array, so changes to it do not affect this tokenizer. * * @return The tokens as a String array */ public String[] getTokenArray() { checkTokenized(); return tokens.clone(); } /** * Gets a copy of the full token list as an independent modifiable list. * * @return The tokens as a String list */ public List* These characters are trimmed off on each side of the delimiter * until the token or quote is found. * The default value is not to trim anything. *
* * @return The trimmer matcher in use */ public StrMatcher getTrimmerMatcher() { return trimmerMatcher; } /** * Checks whether there are any more tokens. * * @return true if there are more tokens */ @Override public boolean hasNext() { checkTokenized(); return tokenPos < tokens.length; } /** * Checks whether there are any previous tokens that can be iterated to. * * @return true if there are previous tokens */ @Override public boolean hasPrevious() { checkTokenized(); return tokenPos > 0; } /** * Gets whether the tokenizer currently returns empty tokens as null. * The default for this property is false. * * @return true if empty tokens are returned as null */ public boolean isEmptyTokenAsNull() { return this.emptyAsNull; } /** * Gets whether the tokenizer currently ignores empty tokens. * The default for this property is true. * * @return true if empty tokens are not returned */ public boolean isIgnoreEmptyTokens() { return ignoreEmptyTokens; } /** * Checks if the characters at the index specified match the quote * already matched in readNextToken(). * * @param srcChars the character array being tokenized * @param pos the position to check for a quote * @param len the length of the character array being tokenized * @param quoteStart the start position of the matched quote, 0 if no quoting * @param quoteLen the length of the matched quote, 0 if no quoting * @return true if a quote is matched */ private boolean isQuote(final char[] srcChars, final int pos, final int len, final int quoteStart, final int quoteLen) { for (int i = 0; i < quoteLen; i++) { if (pos + i >= len || srcChars[pos + i] != srcChars[quoteStart + i]) { return false; } } return true; } /** * Gets the next token. * * @return The next String token * @throws NoSuchElementException if there are no more elements */ @Override public String next() { if (hasNext()) { return tokens[tokenPos++]; } throw new NoSuchElementException(); } /** * Gets the index of the next token to return. 
* * @return The next token index */ @Override public int nextIndex() { return tokenPos; } /** * Gets the next token from the String. * Equivalent to {@link #next()} except it returns null rather than * throwing {@link NoSuchElementException} when no tokens remain. * * @return The next sequential token, or null when no more tokens are found */ public String nextToken() { if (hasNext()) { return tokens[tokenPos++]; } return null; } /** * Gets the token previous to the last returned token, moving the iteration position back by one. * * @return The previous token * @throws NoSuchElementException if there is no previous token */ @Override public String previous() { if (hasPrevious()) { return tokens[--tokenPos]; } throw new NoSuchElementException(); } /** * Gets the index of the previous token, which is one less than the current iteration position. * * @return The previous token index */ @Override public int previousIndex() { return tokenPos - 1; } /** * Gets the previous token from the String. * Equivalent to {@link #previous()} except it returns null rather than * throwing {@link NoSuchElementException} when there is no previous token. * * @return The previous sequential token, or null when no more tokens are found */ public String previousToken() { if (hasPrevious()) { return tokens[--tokenPos]; } return null; } /** * Reads character by character through the String to get the next token. * NOTE(review): the rest of this method's signature and its body are not present in this copy of the file; confirm against the upstream source. * * @param srcChars the character array being tokenized * @param start the first character of field * @param len the length of the character array being tokenized * @param workArea a temporary work area * @param tokenList the list of parsed tokens * @return The starting position of the next field (the character * immediately after the delimiter), or -1 if end of string found */ private int readNextToken(final char[] srcChars, int start, final int len, final StrBuilder workArea, final List* This method allows the same tokenizer to be reused for the same String. * * @return this, to enable chaining */ public StrTokenizer reset() { tokenPos = 0; tokens = null; return this; } /** * Reset this tokenizer, giving it a new input string to parse. * In this manner you can re-use a tokenizer with the same settings * on multiple input lines. 
* * @param input the new character array to tokenize, not cloned, null sets no text to parse * @return this, to enable chaining */ public StrTokenizer reset(final char[] input) { reset(); if (input != null) { this.chars = input.clone(); } else { this.chars = null; } return this; } /** * Reset this tokenizer, giving it a new input string to parse. * In this manner you can re-use a tokenizer with the same settings * on multiple input lines. * * @param input the new string to tokenize, null sets no text to parse * @return this, to enable chaining */ public StrTokenizer reset(final String input) { reset(); if (input != null) { this.chars = input.toCharArray(); } else { this.chars = null; } return this; } /** * Unsupported ListIterator operation. * @param obj this parameter ignored. * @throws UnsupportedOperationException always */ @Override public void set(final String obj) { throw new UnsupportedOperationException("set() is unsupported"); } /** * Sets the field delimiter character. * * @param delim the delimiter character to use * @return this, to enable chaining */ public StrTokenizer setDelimiterChar(final char delim) { return setDelimiterMatcher(StrMatcher.charMatcher(delim)); } /** * Sets the field delimiter matcher. *
* The delimiter is used to separate one token from another. *
*
 * @param delim the delimiter matcher to use; null installs
 *              {@code StrMatcher.noneMatcher()} instead
 * @return this, to enable chaining
 */
public StrTokenizer setDelimiterMatcher(final StrMatcher delim) {
    this.delimMatcher = delim == null ? StrMatcher.noneMatcher() : delim;
    return this;
}

/**
 * Sets the field delimiter string by installing a string matcher for it
 * as the delimiter matcher.
 *
 * @param delim the delimiter string to use
 * @return this, to enable chaining
 */
public StrTokenizer setDelimiterString(final String delim) {
    final StrMatcher stringDelimiter = StrMatcher.stringMatcher(delim);
    return setDelimiterMatcher(stringDelimiter);
}

/**
 * Sets whether the tokenizer should return empty tokens as null.
 * The default for this property is false.
 *
 * @param emptyAsNull whether empty tokens are returned as null
 * @return this, to enable chaining
 */
public StrTokenizer setEmptyTokenAsNull(final boolean emptyAsNull) {
    this.emptyAsNull = emptyAsNull;
    return this;
}

/**
 * Set the character to ignore.
 * <p>
 * This character is ignored when parsing the String, unless it is
 * within a quoted region.
 * </p>
*
 * @param ignored the ignored character to use
 * @return this, to enable chaining
 */
public StrTokenizer setIgnoredChar(final char ignored) {
    final StrMatcher charToIgnore = StrMatcher.charMatcher(ignored);
    return setIgnoredMatcher(charToIgnore);
}

/**
 * Set the matcher for characters to ignore.
 * <p>
 * These characters are ignored when parsing the String, unless they are
 * within a quoted region.
 * </p>
*
 * @param ignored the ignored matcher to use, null ignored
 * @return this, to enable chaining
 */
public StrTokenizer setIgnoredMatcher(final StrMatcher ignored) {
    // A null argument leaves the current matcher in place.
    if (ignored == null) {
        return this;
    }
    this.ignoredMatcher = ignored;
    return this;
}

/**
 * Sets whether the tokenizer should ignore and not return empty tokens.
 * The default for this property is true.
 *
 * @param ignoreEmptyTokens whether empty tokens are not returned
 * @return this, to enable chaining
 */
public StrTokenizer setIgnoreEmptyTokens(final boolean ignoreEmptyTokens) {
    this.ignoreEmptyTokens = ignoreEmptyTokens;
    return this;
}

/**
 * Sets the quote character to use.
 * <p>
 * The quote character is used to wrap data between the tokens.
 * This enables delimiters to be entered as data.
 * </p>
*
 * @param quote the quote character to use
 * @return this, to enable chaining
 */
public StrTokenizer setQuoteChar(final char quote) {
    final StrMatcher charQuote = StrMatcher.charMatcher(quote);
    return setQuoteMatcher(charQuote);
}

/**
 * Set the quote matcher to use.
 * <p>
 * The quote character is used to wrap data between the tokens.
 * This enables delimiters to be entered as data.
 * </p>
*
 * @param quote the quote matcher to use, null ignored
 * @return this, to enable chaining
 */
public StrTokenizer setQuoteMatcher(final StrMatcher quote) {
    // A null argument leaves the current matcher in place.
    if (quote == null) {
        return this;
    }
    this.quoteMatcher = quote;
    return this;
}

/**
 * Sets the matcher for characters to trim.
 * <p>
 * These characters are trimmed off on each side of the delimiter
 * until the token or quote is found.
 * </p>
*
 * @param trimmer the trimmer matcher to use, null ignored
 * @return this, to enable chaining
 */
public StrTokenizer setTrimmerMatcher(final StrMatcher trimmer) {
    // A null argument leaves the current matcher in place.
    if (trimmer == null) {
        return this;
    }
    this.trimmerMatcher = trimmer;
    return this;
}

/**
 * Gets the number of tokens found in the String.
 *
 * @return The number of matched tokens
 */
public int size() {
    checkTokenized();
    final int tokenCount = tokens.length;
    return tokenCount;
}

/**
 * Internal method to perform the tokenization.
 * <p>
 * Most users of this class do not need to call this method. This method
 * will be called automatically by other (public) methods when required.
 * </p>
 * <p>
 * This method exists to allow subclasses to add code before or after the
 * tokenization. For example, a subclass could alter the character array,
 * offset or count to be parsed, or call the tokenizer multiple times on
 * multiple strings. It is also possible to filter the results.
 * </p>
** {@code StrTokenizer} will always pass a zero offset and a count * equal to the length of the array to this method, however a subclass * may pass other values, or even an entirely different array. *
* * @param srcChars the character array being tokenized, may be null * @param offset the start position within the character array, must be valid * @param count the number of characters to tokenize, must be valid * @return The modifiable list of String tokens, unmodifiable if null array or zero count */ protected List