diff --git a/core/src/main/java/com/yahoo/ycsb/generator/IncrementingPrintableStringGenerator.java b/core/src/main/java/com/yahoo/ycsb/generator/IncrementingPrintableStringGenerator.java new file mode 100644 index 00000000..82406f02 --- /dev/null +++ b/core/src/main/java/com/yahoo/ycsb/generator/IncrementingPrintableStringGenerator.java @@ -0,0 +1,389 @@ +/** + * Copyright (c) 2016 YCSB contributors. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. You + * may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. See accompanying + * LICENSE file. + */ +package com.yahoo.ycsb.generator; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.NoSuchElementException; +import java.util.Set; + +/** + * A generator that produces strings of {@link #length} using a set of code points + * from {@link #characterSet}. Each time {@link #nextValue()} is executed, the string + * is incremented by one character. Eventually the string may rollover to the beginning + * and the user may choose to have the generator throw a NoSuchElementException at that + * point or continue incrementing. (By default the generator will continue incrementing). + *

+ * For example, if we set a length of 2 characters and the character set includes + * [A, B] then the generator output will be: + *

+ *

+ * This class includes some default character sets to choose from including ASCII + * and plane 0 UTF. + */ +public class IncrementingPrintableStringGenerator extends Generator { + + /** Default string length for the generator. */ + public static final int DEFAULTSTRINGLENGTH = 8; + + /** + * Set of all character types that include every symbol other than non-printable + * control characters. + */ + public static final Set CHAR_TYPES_ALL_BUT_CONTROL; + static { + CHAR_TYPES_ALL_BUT_CONTROL = new HashSet(24); + // numbers + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.DECIMAL_DIGIT_NUMBER); + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.LETTER_NUMBER); + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.OTHER_NUMBER); + + // letters + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.UPPERCASE_LETTER); + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.LOWERCASE_LETTER); + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.TITLECASE_LETTER); + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.OTHER_LETTER); + + // marks + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.COMBINING_SPACING_MARK); + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.NON_SPACING_MARK); + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.ENCLOSING_MARK); + + // punctuation + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.CONNECTOR_PUNCTUATION); + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.DASH_PUNCTUATION); + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.START_PUNCTUATION); + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.END_PUNCTUATION); + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.INITIAL_QUOTE_PUNCTUATION); + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.FINAL_QUOTE_PUNCTUATION); + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.OTHER_PUNCTUATION); + + // symbols + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.MATH_SYMBOL); + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.CURRENCY_SYMBOL); + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.MODIFIER_SYMBOL); + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.OTHER_SYMBOL); + + // separators + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.SPACE_SEPARATOR); + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.LINE_SEPARATOR); + CHAR_TYPES_ALL_BUT_CONTROL.add((int)Character.PARAGRAPH_SEPARATOR); + } + + /** + * Set of character types including only decimals, upper and lower case letters. + */ + public static final Set CHAR_TYPES_BASIC_ALPHA; + static { + CHAR_TYPES_BASIC_ALPHA = new HashSet(2); + CHAR_TYPES_BASIC_ALPHA.add((int)Character.UPPERCASE_LETTER); + CHAR_TYPES_BASIC_ALPHA.add((int)Character.LOWERCASE_LETTER); + } + + /** + * Set of character types including only decimals, upper and lower case letters. + */ + public static final Set CHAR_TYPES_BASIC_ALPHANUMERICS; + static { + CHAR_TYPES_BASIC_ALPHANUMERICS = new HashSet(3); + CHAR_TYPES_BASIC_ALPHANUMERICS.add((int)Character.DECIMAL_DIGIT_NUMBER); + CHAR_TYPES_BASIC_ALPHANUMERICS.add((int)Character.UPPERCASE_LETTER); + CHAR_TYPES_BASIC_ALPHANUMERICS.add((int)Character.LOWERCASE_LETTER); + } + + /** + * Set of character types including only decimals, letter numbers, + * other numbers, upper, lower, title case as well as letter modifiers + * and other letters. + */ + public static final Set CHAR_TYPE_EXTENDED_ALPHANUMERICS; + static { + CHAR_TYPE_EXTENDED_ALPHANUMERICS = new HashSet(8); + CHAR_TYPE_EXTENDED_ALPHANUMERICS.add((int)Character.DECIMAL_DIGIT_NUMBER); + CHAR_TYPE_EXTENDED_ALPHANUMERICS.add((int)Character.LETTER_NUMBER); + CHAR_TYPE_EXTENDED_ALPHANUMERICS.add((int)Character.OTHER_NUMBER); + CHAR_TYPE_EXTENDED_ALPHANUMERICS.add((int)Character.UPPERCASE_LETTER); + CHAR_TYPE_EXTENDED_ALPHANUMERICS.add((int)Character.LOWERCASE_LETTER); + CHAR_TYPE_EXTENDED_ALPHANUMERICS.add((int)Character.TITLECASE_LETTER); + CHAR_TYPE_EXTENDED_ALPHANUMERICS.add((int)Character.MODIFIER_LETTER); + CHAR_TYPE_EXTENDED_ALPHANUMERICS.add((int)Character.OTHER_LETTER); + } + + /** The character set to iterate over. */ + private final int[] characterSet; + + /** An array indices matching a position in the output string. */ + private int[] indices; + + /** The length of the output string in characters. */ + private final int length; + + /** The last value returned by the generator. Should be null if {@link #nextValue()} + * has not been called.*/ + private String lastValue; + + /** Whether or not to throw an exception when the string rolls over. */ + private boolean throwExceptionOnRollover; + + /** Whether or not the generator has rolled over. */ + private boolean hasRolledOver; + + /** + * Generates strings of 8 characters using only the upper and lower case alphabetical + * characters from the ASCII set. + */ + public IncrementingPrintableStringGenerator() { + this(DEFAULTSTRINGLENGTH, printableBasicAlphaASCIISet()); + } + + /** + * Generates strings of {@link #length} characters using only the upper and lower + * case alphabetical characters from the ASCII set. + * @param length The length of string to return from the generator. + * @throws IllegalArgumentException if the length is less than one. + */ + public IncrementingPrintableStringGenerator(final int length) { + this(length, printableBasicAlphaASCIISet()); + } + + /** + * Generates strings of {@link #length} characters using the code points in + * {@link #characterSet}. + * @param length The length of string to return from the generator. + * @param characterSet A set of code points to choose from. Code points in the + * set can be in any order, not necessarily lexical. + * @throws IllegalArgumentException if the length is less than one or the character + * set has fewer than one code points. + */ + public IncrementingPrintableStringGenerator(final int length, final int[] characterSet) { + if (length < 1) { + throw new IllegalArgumentException("Length must be greater than or equal to 1"); + } + if (characterSet == null || characterSet.length < 1) { + throw new IllegalArgumentException("Character set must have at least one character"); + } + this.length = length; + this.characterSet = characterSet; + indices = new int[length]; + } + + @Override + public String nextValue() { + if (hasRolledOver && throwExceptionOnRollover) { + throw new NoSuchElementException("The generator has rolled over to the beginning"); + } + + final StringBuilder buffer = new StringBuilder(length); + for (int i = 0; i < length; i++) { + buffer.append(Character.toChars(characterSet[indices[i]])); + } + + // increment the indices; + for (int i = length - 1; i >= 0; --i) { + if (indices[i] >= characterSet.length - 1) { + indices[i] = 0; + if (i == 0 || characterSet.length == 1 && lastValue != null) { + hasRolledOver = true; + } + } else { + ++indices[i]; + break; + } + } + + lastValue = buffer.toString(); + return lastValue; + } + + @Override + public String lastValue() { + return lastValue; + } + + /** @param exceptionOnRollover Whether or not to throw an exception on rollover. */ + public void setThrowExceptionOnRollover(final boolean exceptionOnRollover) { + this.throwExceptionOnRollover = exceptionOnRollover; + } + + /** @return Whether or not to throw an exception on rollover. */ + public boolean getThrowExceptionOnRollover() { + return throwExceptionOnRollover; + } + + /** + * Returns an array of printable code points with only the upper and lower + * case alphabetical characters from the basic ASCII set. + * @return An array of code points + */ + public static int[] printableBasicAlphaASCIISet() { + final List validCharacters = + generatePrintableCharacterSet(0, 127, null, false, CHAR_TYPES_BASIC_ALPHA); + final int[] characterSet = new int[validCharacters.size()]; + for (int i = 0; i < validCharacters.size(); i++) { + characterSet[i] = validCharacters.get(i); + } + return characterSet; + } + + /** + * Returns an array of printable code points with the upper and lower case + * alphabetical characters as well as the numeric values from the basic + * ASCII set. + * @return An array of code points + */ + public static int[] printableBasicAlphaNumericASCIISet() { + final List validCharacters = + generatePrintableCharacterSet(0, 127, null, false, CHAR_TYPES_BASIC_ALPHANUMERICS); + final int[] characterSet = new int[validCharacters.size()]; + for (int i = 0; i < validCharacters.size(); i++) { + characterSet[i] = validCharacters.get(i); + } + return characterSet; + } + + /** + * Returns an array of printable code points with the entire basic ASCII table, + * including spaces. Excludes new lines. + * @return An array of code points + */ + public static int[] fullPrintableBasicASCIISet() { + final List validCharacters = + generatePrintableCharacterSet(32, 127, null, false, null); + final int[] characterSet = new int[validCharacters.size()]; + for (int i = 0; i < validCharacters.size(); i++) { + characterSet[i] = validCharacters.get(i); + } + return characterSet; + } + + /** + * Returns an array of printable code points with the entire basic ASCII table, + * including spaces and new lines. + * @return An array of code points + */ + public static int[] fullPrintableBasicASCIISetWithNewlines() { + final List validCharacters =new ArrayList(); + validCharacters.add(10); // newline + validCharacters.addAll(generatePrintableCharacterSet(32, 127, null, false, null)); + final int[] characterSet = new int[validCharacters.size()]; + for (int i = 0; i < validCharacters.size(); i++) { + characterSet[i] = validCharacters.get(i); + } + return characterSet; + } + + /** + * Returns an array of printable code points the first plane of Unicode characters + * including only the alpha-numeric values. + * @return An array of code points + */ + public static int[] printableAlphaNumericPlaneZeroSet() { + final List validCharacters = + generatePrintableCharacterSet(0, 65535, null, false, CHAR_TYPES_BASIC_ALPHANUMERICS); + final int[] characterSet = new int[validCharacters.size()]; + for (int i = 0; i < validCharacters.size(); i++) { + characterSet[i] = validCharacters.get(i); + } + return characterSet; + } + + /** + * Returns an array of printable code points the first plane of Unicode characters + * including all printable characters. + * @return An array of code points + */ + public static int[] fullPrintablePlaneZeroSet() { + final List validCharacters = + generatePrintableCharacterSet(0, 65535, null, false, CHAR_TYPES_ALL_BUT_CONTROL); + final int[] characterSet = new int[validCharacters.size()]; + for (int i = 0; i < validCharacters.size(); i++) { + characterSet[i] = validCharacters.get(i); + } + return characterSet; + } + + /** + * Generates a list of code points based on a range and filters. + * These can be used for generating strings with various ASCII and/or + * Unicode printable character sets for use with DBs that may have + * character limitations. + *

+ * Note that control, surrogate, format, private use and unassigned + * code points are skipped. + * @param startCodePoint The starting code point, inclusive. + * @param lastCodePoint The final code point, inclusive. + * @param characterTypesFilter An optional set of allowable character + * types. See {@link Character} for types. + * @param isFilterAllowableList Determines whether the {@code allowableTypes} + * set is inclusive or exclusive. When true, only those code points that + * appear in the list will be included in the resulting set. Otherwise + * matching code points are excluded. + * @param allowableTypes An optional list of code points for inclusion or + * exclusion. + * @return A list of code points matching the given range and filters. The + * list may be empty but is guaranteed not to be null. + */ + public static List generatePrintableCharacterSet( + final int startCodePoint, + final int lastCodePoint, + final Set characterTypesFilter, + final boolean isFilterAllowableList, + final Set allowableTypes) { + + // since we don't know the final size of the allowable character list we + // start with a list then we'll flatten it to an array. + final List validCharacters = new ArrayList(lastCodePoint); + + for (int codePoint = startCodePoint; codePoint <= lastCodePoint; ++codePoint) { + if (allowableTypes != null && + !allowableTypes.contains(Character.getType(codePoint))) { + continue; + } else { + // skip control points, formats, surrogates, etc + final int type = Character.getType(codePoint); + if (type == Character.CONTROL || + type == Character.SURROGATE || + type == Character.FORMAT || + type == Character.PRIVATE_USE || + type == Character.UNASSIGNED) { + continue; + } + } + + if (characterTypesFilter != null) { + // if the filter is enabled then we need to make sure the code point + // is in the allowable list if it's a whitelist or that the code point + // is NOT in the list if it's a blacklist. + if ((isFilterAllowableList && !characterTypesFilter.contains(codePoint)) || + (characterTypesFilter.contains(codePoint))) { + continue; + } + } + + validCharacters.add(codePoint); + } + return validCharacters; + } + +} diff --git a/core/src/test/java/com/yahoo/ycsb/generator/TestIncrementingPrintableStringGenerator.java b/core/src/test/java/com/yahoo/ycsb/generator/TestIncrementingPrintableStringGenerator.java new file mode 100644 index 00000000..eea3d507 --- /dev/null +++ b/core/src/test/java/com/yahoo/ycsb/generator/TestIncrementingPrintableStringGenerator.java @@ -0,0 +1,130 @@ +/** + * Copyright (c) 2016 YCSB contributors. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you + * may not use this file except in compliance with the License. You + * may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. See the License for the specific language governing + * permissions and limitations under the License. See accompanying + * LICENSE file. + */ +package com.yahoo.ycsb.generator; + +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertNull; +import static org.testng.Assert.fail; + +import java.util.NoSuchElementException; + +import org.testng.annotations.Test; + +public class TestIncrementingPrintableStringGenerator { + private final static int[] ATOC = new int[] { 65, 66, 67 }; + + @Test + public void rolloverOK() throws Exception { + final IncrementingPrintableStringGenerator gen = + new IncrementingPrintableStringGenerator(2, ATOC); + + assertNull(gen.lastValue()); + assertEquals(gen.nextValue(), "AA"); + assertEquals(gen.lastValue(), "AA"); + assertEquals(gen.nextValue(), "AB"); + assertEquals(gen.lastValue(), "AB"); + assertEquals(gen.nextValue(), "AC"); + assertEquals(gen.lastValue(), "AC"); + assertEquals(gen.nextValue(), "BA"); + assertEquals(gen.lastValue(), "BA"); + assertEquals(gen.nextValue(), "BB"); + assertEquals(gen.lastValue(), "BB"); + assertEquals(gen.nextValue(), "BC"); + assertEquals(gen.lastValue(), "BC"); + assertEquals(gen.nextValue(), "CA"); + assertEquals(gen.lastValue(), "CA"); + assertEquals(gen.nextValue(), "CB"); + assertEquals(gen.lastValue(), "CB"); + assertEquals(gen.nextValue(), "CC"); + assertEquals(gen.lastValue(), "CC"); + assertEquals(gen.nextValue(), "AA"); // <-- rollover + assertEquals(gen.lastValue(), "AA"); + } + + @Test + public void rolloverOneCharacterOK() throws Exception { + // It would be silly to create a generator with one character. + final IncrementingPrintableStringGenerator gen = + new IncrementingPrintableStringGenerator(2, new int[] { 65 }); + for (int i = 0; i < 5; i++) { + assertEquals(gen.nextValue(), "AA"); + } + } + + @Test + public void rolloverException() throws Exception { + final IncrementingPrintableStringGenerator gen = + new IncrementingPrintableStringGenerator(2, ATOC); + gen.setThrowExceptionOnRollover(true); + + int i = 0; + try { + while(i < 11) { + ++i; + gen.nextValue(); + } + fail("Expected NoSuchElementException"); + } catch (NoSuchElementException e) { + assertEquals(i, 10); + } + } + + @Test + public void rolloverOneCharacterException() throws Exception { + // It would be silly to create a generator with one character. + final IncrementingPrintableStringGenerator gen = + new IncrementingPrintableStringGenerator(2, new int[] { 65 }); + gen.setThrowExceptionOnRollover(true); + + int i = 0; + try { + while(i < 3) { + ++i; + gen.nextValue(); + } + fail("Expected NoSuchElementException"); + } catch (NoSuchElementException e) { + assertEquals(i, 2); + } + } + + @Test + public void invalidLengths() throws Exception { + try { + new IncrementingPrintableStringGenerator(0, ATOC); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException e) { } + + try { + new IncrementingPrintableStringGenerator(-42, ATOC); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException e) { } + } + + @Test + public void invalidCharacterSets() throws Exception { + try { + new IncrementingPrintableStringGenerator(2, null); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException e) { } + + try { + new IncrementingPrintableStringGenerator(2, new int[] {}); + fail("Expected IllegalArgumentException"); + } catch (IllegalArgumentException e) { } + } +}