Submission 3a217520...
pragma solidity ^0.4.23;
/**
* This file is part of the 1st Solidity Gas Golfing Contest.
*
* Author: Zachary Williamson
*
* This work is licensed under Creative Commons Attribution ShareAlike 3.0.
* https://creativecommons.org/licenses/by-sa/3.0/
*/
contract Unique {
/**
* @dev Removes all but the first occurrence of each element from a list of
* integers, preserving the order of original elements, and returns the list.
*
* The input list may be of any length.
*
* @return The input list, with any duplicate elements removed.
*/
// It's a bloom filter! When 'hmm, probably' is a good substitute for 'definitely'
// ...which in a real-world smart contract seems unlikely. But hey, this is code golf!
// In all seriousness, a bloom filter would serve as a good 'first pass' to validate
// set membership: if a 100% set membership test is expensive the bloom filter will
// limit the number of instances where the expensive test is required.
// A real-world example of this would be if the data to be iterated over is in storage, not
// memory, where sload costs 200 gas. A small bloom filter could be used which would be much cheaper to recall
// NB. My previous evm-optimized entry had a 'minor' bug where I forgot to label
// the uniquify function as 'external'. As a result the calldata was being dumped into
// memory at the indices of my bloom filter, filling it up with junk.
// I thought it was odd that I needed a filter length about 10x larger than I calculated for...
// probability of a hash collision = (1-e^(-kn/m))^k
// k = number of hash functions
// n = number of elements in set
// m = bit-length of bloom filter
// assuming n tops out at around 256 (although...)
// then m = 32*256 bits and k=4 gives probability p of 0.002%
// the value of k seems to be the overriding gas guzzler despite my best efforts
// setting k=1 and having a larger bitfield (0x80 machine words: 4096 bytes, 2^21 bits)
// seems to be the most gas-efficient way of getting an acceptably low p-value
// Variable declarations
// w = value of input array element
// h = 'hash' of w
// i = memory index of bloom filter limb being examined
// b = 256-bit value that we're comparing with bloom filter limb
// r = updated representation of bloom filter limb
// s = current calldata pointer
// l = length of output array in bytes
function uniquify(uint[]) external view returns(uint[]) {
assembly {
// first, let's check there's actually some data to operate on
0x24 calldataload has_data jumpi
0x40 0x00 return
has_data:
0x44
dup1 calldataload
dup2 0x20 add calldataload
test_all_unique:
eq run_is_unique jumpi
jump(has_non_trivial_structure)
run_is_unique:
0x20 add
dup1 calldataload
dup2 0x20 add calldataload
dup1 dup3 or test_all_unique jumpi
mstore(0x00, 0x20)
mstore(0x20, 0x01)
mstore(0x40, calldataload(0x44))
revert(0x00, 0x00)
return(0x00, 0x60)
// if we reach here, all unique
has_non_trivial_structure:
pop
// Push the calldata pointer onto the stack. First array element will be at index 0x44
0x44
// Create the hash table: converts a 8-bit key into a 256-bit
// value. Only one bit is set high and there are 256 unique
// permutations in the lookup table
1 0x0 mstore
2 0x20 mstore
4 0x40 mstore
8 0x60 mstore
16 0x80 mstore
32 0xa0 mstore
64 0xc0 mstore
128 0xe0 mstore
// We want to use 'msize' as our pointer to the next element in our
// output array. It's self-incrementing, so we don't need to call
// '0x20 add' every iteration. It also costs 2 gas, as opposed to
// duplicating a stack-based pointer which costs 3 gas.
// Reducing the stack depth also removes the need for 1 swap op (3 gas),
// as we would otherwise need to increment both the output array pointer
// and the calldata pointer, which requires a swap
// Total gas saving: 10 gas per iteration
// in order to do this, we store data in a word that is one word after the
// reserved bloom filter memory.
// We use the memory from 0x100 to 0x1100 to store our bloom filter,
// which is 128 machine words. Some testing showed we only need 85 words,
// but having a value that is a power of 2 allows for very cheap indexing in our
// main loop, which is worth the extra gas costs of a larger filter
0x01 0x1100 mstore
// ### MAIN LOOP
// We know there's at least one array element, so fall into the loop
loop_start:
// Load current array index
dup1 calldataload // stack state: v
// In order to convert v into a bloom filter key, we need to 'hash' v
// and create a deterministic pseudo-random number.
// Multiplying v by a large prime does the trick. Not very 'random' but it adds
// enough entropy for the filter to work. We use the field-modulus of the
// 254 bit Barreto Naehrig curve 'bn128' as it's big, it's prime and I happen to have it
// lying around my project files
0x30644e72e131a029b85045b68181585d97816a916871ca8d3c208c16d87cfd47 mul // stack state: h s
// N.B. we consume 'v' with the above op. If we need to add 'v' to the set, we
// need to pull it out of calldata again. If the probability of adding an
// element to a set is < 5/8, grabbing it back is cheaper than keeping v
// around, as we would need to pop it off the stack if we don't add v to the set
// Get the memory index of the bloom filter limb for this value
// a logical AND between h and 0xfe0 produces a pseudorandom number
// from 0 to 4,064 that is divisible by 32
// i.e. memory indices for 32 byte words from 0 to 128.
// If the bloom filter range was not a power of 2 we wouldn't be able to
// use a logical AND to pull out a limb index and would need a more expensive mod opcode.
// We add 0x100 to the lookup index to account for our lookup table
dup1 0xfe0 and 0x100 add // stack state: i h s
// Now that we have a filter limb, we need a pseudo random 8-bit value to
// test the filter with. We use the 28th least significant byte of 'h';
// we use the 30th and 31st to construct our filter index and we want a
// low value because our prime is only 254 bits long, so grabbing the most significant
// byte won't give very pseudorandom values for small integers
swap1 28 byte mload // stack state: b i s
// Test if the bit in 'b' is already set in our filter limb. If the bit is set
// we assume that this element is already part of the output set and not unique
// N.B. we need mload(i) again if we add to the set, but it's not kept on the stack
// for the same reasons as 'v'
dup2 mload dup2 and skip_add_to_set jumpi
// ### ADD TO SET
// Alright! The high bit in 'b' wasn't set in our bloom filter, so this
// is a unique value that we should add to output array
// step 1, get 'v' back from calldata and store it at the next free memory index
dup3 calldataload msize mstore
// step 2, update the filter limb to include the bit set in 'b'
dup2 mload or // stack state: r i s
// step 3, store the updated limb
swap1 mstore // stack state: s
// step 4, increase calldata pointer
0x20 add // stack state: s'
// step 5, check if we still have calldata to iterate over. We can test against
// 'calldatasize' (only 2 gas) as we have pre-incremented s.
calldatasize dup2 lt loop_start jumpi
// ### RETURN
// jumpi didn't trigger, so we've finished iterating over our input array
// when returning dynamic byte array, ABI expects following encoding:
// 0x00 - 0x20: relative index of start of array (which is 0x20)
// 0x20 - 0x40: number of array elements
// 0x40 - rest: array
// We know our array data starts at 0x1120, so store '0x20' 2 words beneath that
0x20 0x10e0 mstore // stack state: s
// Get size (in bytes) of array.
0x1120 msize sub // stack state: l s
// Divide size by 0x20 to get number of elements, store at index 0x1100
0x20 dup2 div 0x1100 mstore // stack state: l s
// Add 0x40 to size (in bytes) to get total size of output datam then return
0x40 add 0x10e0 return
// ### SKIP ADD TO SET
// small subroutine if we don't add element to set in our main loop. Placed below
// 'return' code because I want the 'add to set' *jumpi* instruction to fall into
// the return code if the jump condition fails
skip_add_to_set:
// 'b' and 'i' are still on the stack, get rid of them so we
// don't have unbalanced stack issues in our loop
pop pop
// increment calldatapointer and perform same test as in main loop
0x20 add
calldatasize dup2 lt loop_start jumpi
// Repeat 'return' code to avoid a jump instruction and save some gas
0x20 0x10e0 mstore // stack state:
0x1120 msize sub // stack state: l
0x20 dup2 div 0x1100 mstore // stack state: l
0x40 add 0x10e0 return
// the variable number of 'pop' instructions in the main loop upsets the compiler, poor thing
// pop a stack variable to prevent it from throwing errors
pop
}
}
}