From b2ea6095bf2d893322d9d88b00e57781279c3475 Mon Sep 17 00:00:00 2001 From: Jesse Wilson Date: Mon, 1 Jun 2020 19:42:58 -0400 Subject: [PATCH] New APIs: ByteString.toIndex() and ByteString.toFraction() The first one may be useful with hashing to put byte strings in partitioning buckets for scaling. For example, to divide a dataset into 32 partitions, hash the key then use toIndex(32) to map the key to its partition. The second one may be useful with dynamic experiments and A/B tests. For example, to assign a control group to 5% of customers call toFraction() and put the user in the control group if the result is less than 0.05. --- okio/src/commonMain/kotlin/okio/ByteString.kt | 93 +++++++++++++++++++ .../kotlin/okio/internal/ByteString.kt | 32 +++++++ .../commonTest/kotlin/okio/ByteStringTest.kt | 53 +++++++++++ okio/src/jsMain/kotlin/okio/ByteString.kt | 6 ++ okio/src/jvmMain/kotlin/okio/ByteString.kt | 6 ++ okio/src/nativeMain/kotlin/okio/ByteString.kt | 6 ++ 6 files changed, 196 insertions(+) diff --git a/okio/src/commonMain/kotlin/okio/ByteString.kt b/okio/src/commonMain/kotlin/okio/ByteString.kt index 4e52882701..86f697930f 100644 --- a/okio/src/commonMain/kotlin/okio/ByteString.kt +++ b/okio/src/commonMain/kotlin/okio/ByteString.kt @@ -20,6 +20,7 @@ import kotlin.jvm.JvmField import kotlin.jvm.JvmName import kotlin.jvm.JvmOverloads import kotlin.jvm.JvmStatic +import kotlin.math.min /** * An immutable sequence of bytes. @@ -137,6 +138,98 @@ internal constructor(data: ByteArray) : Comparable { override fun compareTo(other: ByteString): Int + /** + * Projects this value to the range `[0..size)` using linear interpolation. This is equivalent to + * a sorted partitioning of all possible byte strings across [size] equally-sized buckets and + * returning the index of the bucket that this byte string fits in. + * + * For example, the byte string `8000` is the median of all 2-element byte strings, and calling + * `toIndex(100)` on it returns 50. 
Some other examples: + * + * | Byte String (hex) | `toIndex(100)` | `toIndex(256)` | `toIndex(Int.MAX_VALUE)` | + * | :----------------- | -------------: | -------------: | -----------------------: | + * | (empty) | 0 | 0 | 0 | + * | 00 | 0 | 0 | 0 | + * | 0000 | 0 | 0 | 0 | + * | 000000 | 0 | 0 | 0 | + * | 0000000001 | 0 | 0 | 0 | + * | 00000001 | 0 | 0 | 0 | + * | 00000002 | 0 | 0 | 0 | + * | 00000003 | 0 | 0 | 1 | + * | 01 | 0 | 1 | 8388607 | + * | 02 | 0 | 2 | 16777215 | + * | 03 | 1 | 3 | 25165823 | + * | 80 | 50 | 128 | 1073741823 | + * | 8000 | 50 | 128 | 1073741823 | + * | 80000000 | 50 | 128 | 1073741823 | + * | 81 | 50 | 129 | 1082130431 | + * | 81ffffff | 50 | 129 | 1090519038 | + * | 82 | 50 | 130 | 1090519039 | + * | 83 | 51 | 131 | 1098907647 | + * | ff | 99 | 255 | 2139095039 | + * | ffff | 99 | 255 | 2147450879 | + * | ffffffff | 99 | 255 | 2147483646 | + * | ffffffffffff | 99 | 255 | 2147483646 | + * + * This interprets the bytes in this byte string as **unsigned**. This behavior is consistent with + * [compareTo]. The returned value is also consistent with [compareTo] though the dynamic range + * is compressed. For two byte strings `a` and `b`, if `a < b`, then + * `a.toIndex(n) <= b.toIndex(n)` for all sizes `n`. + * + * This examines at most the first 4 bytes of this byte string. Data beyond the first 4 bytes is + * not used to compute the result. + * + * @param size a positive integer; otherwise [IllegalArgumentException] is thrown. + * @return a value that is greater than or equal to `0` and less than [size]. + */ + fun toIndex(size: Int): Int + + /** + * Projects this value to the range `[0.0..1.0)` using linear interpolation. This is equivalent to + * sorting all possible byte strings and returning the fraction that precedes this byte string. + * + * For example, the byte string `8000` is the median of all 2-element byte strings, and calling + * `toFraction()` on it returns 0.5. 
Some other examples: + * + * | Byte String (hex) | `toFraction()` | + * | :----------------- | :----------------- | + * | (empty) | 0.0 | + * | 00 | 0.0 | + * | 0000 | 0.0 | + * | 000000 | 0.0 | + * | 00000000000001 | 0.0 | + * | 00000000000007 | 0.0 | + * | 00000000000008 | 0.0000000000000001 | + * | 0000000001 | 0.0000000000009094 | + * | 00000001 | 0.0000000002328306 | + * | 01 | 0.00390625 | + * | 02 | 0.0078125 | + * | 03 | 0.01171875 | + * | 80 | 0.5 | + * | 8000 | 0.5 | + * | 80000000000000 | 0.5 | + * | 81 | 0.50390625 | + * | 81ffffff | 0.5078124997671694 | + * | 82 | 0.5078125 | + * | 83 | 0.51171875 | + * | ff | 0.99609375 | + * | ffff | 0.9999847412109375 | + * | ffffffff | 0.9999999997671694 | + * | ffffffffffff | 0.9999999999999964 | + * | ffffffffffffff | 0.9999999999999999 | + * + * This interprets the bytes in this byte string as **unsigned**. This behavior is consistent with + * [compareTo]. The returned value is also consistent with [compareTo] though the dynamic range + * is compressed. For two byte strings `a` and `b`, if `a < b`, then + * `a.toFraction() <= b.toFraction()`. + * + * This examines at most the first 53 bits of this byte string: 6 whole bytes and the high 5 bits + * of the 7th byte. Data beyond those bits is not used to compute the result. + * + * @return a value that is greater than or equal to `0.0` and less than `1.0`. + */ + fun toFraction(): Double + /** * Returns a human-readable string that describes the contents of this byte string. Typically this * is a string like `[text=Hello]` or `[hex=0000ffff]`. 
diff --git a/okio/src/commonMain/kotlin/okio/internal/ByteString.kt b/okio/src/commonMain/kotlin/okio/internal/ByteString.kt index 5f0ec021e4..9788c47a59 100644 --- a/okio/src/commonMain/kotlin/okio/internal/ByteString.kt +++ b/okio/src/commonMain/kotlin/okio/internal/ByteString.kt @@ -30,6 +30,7 @@ import okio.isIsoControl import okio.processUtf8CodePoints import okio.shr import okio.toUtf8String +import kotlin.math.min // TODO Kotlin's expect classes can't have default implementations, so platform implementations // have to call these functions. Remove all this nonsense when expect class allow actual code. @@ -248,6 +249,37 @@ internal inline fun ByteString.commonCompareTo(other: ByteString): Int { return if (sizeA < sizeB) -1 else 1 } +@Suppress("NOTHING_TO_INLINE") +internal inline fun ByteString.commonToIndex(size: Int): Int { + require(size > 0) { "size <= 0: $size" } + var numerator = 0L + var denominator = 1L + for (i in 0 until min(4, this.size)) { + numerator = (numerator shl 8) + (get(i) and 0xff) + denominator = (denominator shl 8) + } + return (size * numerator / denominator).toInt() +} + +@Suppress("NOTHING_TO_INLINE") +internal inline fun ByteString.commonToFraction(): Double { + var numerator = 0L + var denominator = 1L + for (i in 0 until min(7, size)) { + numerator = (numerator shl 8) + (get(i) and 0xff) + denominator = (denominator shl 8) + } + + // Double wants 53 bits of precision but we have 56. Discard 3 bits of precision. Without this + // it's possible that this method returns 1.0 for byte strings like "ffffffffffffff". 
+ if (size >= 7) { + numerator = numerator shr 3 + denominator = denominator shr 3 + } + + return numerator.toDouble() / denominator +} + @Suppress("NOTHING_TO_INLINE") internal inline fun commonOf(data: ByteArray) = ByteString(data.copyOf()) diff --git a/okio/src/commonTest/kotlin/okio/ByteStringTest.kt b/okio/src/commonTest/kotlin/okio/ByteStringTest.kt index f287e17fb8..86536afc1a 100644 --- a/okio/src/commonTest/kotlin/okio/ByteStringTest.kt +++ b/okio/src/commonTest/kotlin/okio/ByteStringTest.kt @@ -457,4 +457,57 @@ abstract class AbstractByteStringTest internal constructor( sortedByteStrings.sort() assertEquals(originalByteStrings, sortedByteStrings) } + + @Test fun toIndex() { + assertEquals(0, factory.decodeHex("").toIndex(1)) + assertEquals(0, factory.decodeHex("00").toIndex(1)) + assertEquals(0, factory.decodeHex("ff").toIndex(1)) + assertEquals(0, factory.decodeHex("ffffffff").toIndex(1)) + assertEquals(0, factory.decodeHex("ffffffffffff").toIndex(1)) + + assertEquals(0, factory.decodeHex("").toIndex(100)) + assertEquals(0, factory.decodeHex("00").toIndex(100)) + assertEquals(10, factory.decodeHex("1a").toIndex(100)) + assertEquals(25, factory.decodeHex("40").toIndex(100)) + assertEquals(50, factory.decodeHex("80").toIndex(100)) + assertEquals(75, factory.decodeHex("c0").toIndex(100)) + assertEquals(99, factory.decodeHex("ff").toIndex(100)) + assertEquals(99, factory.decodeHex("ffff").toIndex(100)) + assertEquals(99, factory.decodeHex("ffffff").toIndex(100)) + assertEquals(99, factory.decodeHex("ffffffff").toIndex(100)) + + assertEquals(0, factory.decodeHex("").toIndex(Int.MAX_VALUE)) + assertEquals(0x7f7fffff, factory.decodeHex("ff").toIndex(Int.MAX_VALUE)) + assertEquals(0x7fff7fff, factory.decodeHex("ffff").toIndex(Int.MAX_VALUE)) + assertEquals(0x7fffff7f, factory.decodeHex("ffffff").toIndex(Int.MAX_VALUE)) + assertEquals(0x7ffffffe, factory.decodeHex("ffffffff").toIndex(Int.MAX_VALUE)) + } + + @Test fun toFraction() { + assertEquals(0.0, 
factory.decodeHex("").toFraction()) + assertEquals(0.0, factory.decodeHex("00").toFraction()) + assertEquals(0.0, factory.decodeHex("0000").toFraction()) + assertEquals(0.1015625, factory.decodeHex("1a").toFraction()) + assertEquals(0.25, factory.decodeHex("40").toFraction()) + assertEquals(0.5, factory.decodeHex("80").toFraction()) + assertEquals(0.75, factory.decodeHex("c0").toFraction()) + assertEquals(0.7929493631236255, factory.decodeHex("cafebabe").toFraction()) + assertEquals(0.99609375, factory.decodeHex("ff").toFraction()) + assertEquals(0.9999847412109375, factory.decodeHex("ffff").toFraction()) + assertEquals(0.9999999403953552, factory.decodeHex("ffffff").toFraction()) + assertEquals(0.9999999997671694, factory.decodeHex("ffffffff").toFraction()) + assertEquals(0.9999999999999964, factory.decodeHex("ffffffffffff").toFraction()) + assertEquals(0.9999999999999999, factory.decodeHex("ffffffffffffff").toFraction()) + assertEquals(0.9999999999999999, factory.decodeHex("ffffffffffffffff").toFraction()) + } + + /** Only 5 bits of the 7th byte are used. We use 53 bits in total for IEEE 754 doubles. 
*/ + @Test fun toFractionLast5BitsOf7thByte() { + assertEquals(0.0000000000000000, factory.decodeHex("00000000000007").toFraction()) + assertEquals(1.1102230246251565E-16, factory.decodeHex("00000000000008").toFraction()) + assertEquals(1.1102230246251565E-16, factory.decodeHex("0000000000000f").toFraction()) + assertEquals(2.220446049250313E-16, factory.decodeHex("00000000000010").toFraction()) + assertEquals(0.9999999999999998, factory.decodeHex("fffffffffffff0").toFraction()) + assertEquals(0.9999999999999999, factory.decodeHex("fffffffffffff8").toFraction()) + } } diff --git a/okio/src/jsMain/kotlin/okio/ByteString.kt b/okio/src/jsMain/kotlin/okio/ByteString.kt index 6a360933c2..1a2f3cd4dc 100644 --- a/okio/src/jsMain/kotlin/okio/ByteString.kt +++ b/okio/src/jsMain/kotlin/okio/ByteString.kt @@ -38,6 +38,8 @@ import okio.internal.commonToAsciiLowercase import okio.internal.commonToAsciiUppercase import okio.internal.commonToByteArray import okio.internal.commonToByteString +import okio.internal.commonToFraction +import okio.internal.commonToIndex import okio.internal.commonToString import okio.internal.commonUtf8 import okio.internal.commonWrite @@ -119,6 +121,10 @@ internal actual constructor( actual override fun compareTo(other: ByteString) = commonCompareTo(other) + actual fun toIndex(size: Int) = commonToIndex(size) + + actual fun toFraction() = commonToFraction() + /** * Returns a human-readable string that describes the contents of this byte string. Typically this * is a string like `[text=Hello]` or `[hex=0000ffff]`. 
diff --git a/okio/src/jvmMain/kotlin/okio/ByteString.kt b/okio/src/jvmMain/kotlin/okio/ByteString.kt index 4b3baad941..c0a9d90786 100644 --- a/okio/src/jvmMain/kotlin/okio/ByteString.kt +++ b/okio/src/jvmMain/kotlin/okio/ByteString.kt @@ -38,6 +38,8 @@ import okio.internal.commonToAsciiLowercase import okio.internal.commonToAsciiUppercase import okio.internal.commonToByteArray import okio.internal.commonToByteString +import okio.internal.commonToFraction +import okio.internal.commonToIndex import okio.internal.commonToString import okio.internal.commonUtf8 import okio.internal.commonWrite @@ -181,6 +183,10 @@ internal actual constructor( actual override fun compareTo(other: ByteString) = commonCompareTo(other) + actual fun toIndex(size: Int) = commonToIndex(size) + + actual fun toFraction() = commonToFraction() + actual override fun toString() = commonToString() @Throws(IOException::class) diff --git a/okio/src/nativeMain/kotlin/okio/ByteString.kt b/okio/src/nativeMain/kotlin/okio/ByteString.kt index d49a41aa26..68a2b251c1 100644 --- a/okio/src/nativeMain/kotlin/okio/ByteString.kt +++ b/okio/src/nativeMain/kotlin/okio/ByteString.kt @@ -39,6 +39,8 @@ import okio.internal.commonToAsciiLowercase import okio.internal.commonToAsciiUppercase import okio.internal.commonToByteArray import okio.internal.commonToByteString +import okio.internal.commonToFraction +import okio.internal.commonToIndex import okio.internal.commonToString import okio.internal.commonUtf8 import okio.internal.commonWrite @@ -125,6 +127,10 @@ internal actual constructor( actual override fun compareTo(other: ByteString) = commonCompareTo(other) + actual fun toIndex(size: Int) = commonToIndex(size) + + actual fun toFraction() = commonToFraction() + /** * Returns a human-readable string that describes the contents of this byte string. Typically this * is a string like `[text=Hello]` or `[hex=0000ffff]`.