Skip to main content

SPO600 - Project - Stage Two

In the second stage of my SPO600 project, I will need to implement my optimizations to the function in the open source software that I have chosen, which is SSDUP. I will also need to prove that the optimized code produces the same results to the original code. I will also compare the performance of optimized function results between AArch64 and non AArch64 platforms.

As I mentioned in my stage one, SSDUP uses 3 different Murmur3 hash function, which is optimized for x86 and x64 platforms. The first function is for 32-bit machines and it produces a 32-bit output. The second function is also for 32-bit machines but it produces a 128-bit output. The third function is for 64-bit machines and it produces a 128-bit output. Each hash function will produce a different hash value.

Here is the source code of my file:
-----------------------------------------------------------------
#include "murmur3.h"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#ifdef __GNUC__
#define FORCE_INLINE __attribute__((always_inline)) inline
#else
#define FORCE_INLINE
#endif
#define SIZE 100000

static FORCE_INLINE uint32_t rotl32 ( uint32_t x, int8_t r )
{
  return (x << r) | (x >> (32 - r));
}

static FORCE_INLINE uint64_t rotl64 ( uint64_t x, int8_t r )
{
  return (x << r) | (x >> (64 - r));
}

#define ROTL32(x,y) rotl32(x,y)
#define ROTL64(x,y) rotl64(x,y)

#define BIG_CONSTANT(x) (x##LLU)

//-----------------------------------------------------------------------------
// Block read - if your platform needs to do endian-swapping or can only
// handle aligned reads, do the conversion here

#define getblock(p, i) (p[i])

//-----------------------------------------------------------------------------
// Finalization mix - force all bits of a hash block to avalanche


static FORCE_INLINE uint32_t fmix32 ( uint32_t h )
{
  h ^= h >> 16;
  h *= 0x85ebca6b;
  h ^= h >> 13;
  h *= 0xc2b2ae35;
  h ^= h >> 16;

  return h;
}

//----------

static FORCE_INLINE uint64_t fmix64 ( uint64_t k )
{
  k ^= k >> 33;
  k *= BIG_CONSTANT(0xff51afd7ed558ccd);
  k ^= k >> 33;
  k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
  k ^= k >> 33;

  return k;
}

//-----------------------------------------------------------------------------
int main() {
  uint32_t hash[4];                /* Output for the hash */
  uint32_t seed = 42;              /* Seed value for hash */

  clock_t startTime1, startTime2, startTime3;
  clock_t endTime1, endTime2, endTime3;
  double totalTime1, totalTime2, totalTime3;

  char *samples = (char*)calloc(SIZE, sizeof(char));

  srand(1);

  //generate random characters

  for (int i = 0; i < SIZE; i++) {

            samples[i] = (random() % 26) + 'A';

  }

  /*if (argc != 2) {
    printf("usage: %s \"string to hash\"\n", argv[0]);
    exit(1);
  }

  printf("Input: \"%s\"\n", argv[1]);*/

  startTime1 = clock();  // Start time
  //MurmurHash3_x86_32(argv[1], strlen(argv[1]), seed, hash);
  for (int i = 0; i < SIZE; i++) {

  MurmurHash3_x86_32(samples, strlen(samples), seed, hash);

  }
  endTime1 = clock();  // End time
  totalTime1 = (double)(endTime1 - startTime1) / CLOCKS_PER_SEC;

  printf("x86_32:  %08x\n", hash[0]);
  printf("Total time: %lf seconds\n", totalTime1);

  startTime2 = clock(); // Start time
  //MurmurHash3_x86_128(argv[1], strlen(argv[1]), seed, hash);
  for (int i = 0; i < SIZE; i++) {

  MurmurHash3_x86_128(samples, strlen(samples), seed, hash);

  }
  endTime2 = clock();  // End time
  totalTime2 = (double)(endTime2 - startTime2) / CLOCKS_PER_SEC;

  printf("x86_128: %08x %08x %08x %08x\n",
         hash[0], hash[1], hash[2], hash[3]);
  printf("Total time: %lf seconds\n", totalTime2);

  startTime3 = clock(); // Start time
  //MurmurHash3_x64_128(argv[1], strlen(argv[1]), seed, hash);
  for (int i = 0; i < SIZE; i++) {

  MurmurHash3_x64_128(samples, strlen(samples), seed, hash);

  }
  endTime3 = clock();  // End time
  totalTime3 = (double)(endTime3 - startTime3) / CLOCKS_PER_SEC;

  printf("x64_128: %08x %08x %08x %08x\n",
         hash[0], hash[1], hash[2], hash[3]);
  printf("Total time: %lf seconds\n", totalTime3);

  return 0;
}


void MurmurHash3_x86_32 ( const void * key, int len,
                          uint32_t seed, void * out )
{
  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = len / 4;
  int i;

  uint32_t h1 = seed;

  uint32_t c1 = 0xcc9e2d51;
  uint32_t c2 = 0x1b873593;

  //----------
  // body

  const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);

  for(i = -nblocks; i; i++)
  {
    uint32_t k1 = getblock(blocks,i);

    k1 *= c1;
    k1 = ROTL32(k1,15);
    k1 *= c2;
 
    h1 ^= k1;
    h1 = ROTL32(h1,13);
    h1 = h1*5+0xe6546b64;
  }

  //----------
  // tail

  const uint8_t * tail = (const uint8_t*)(data + nblocks*4);

  uint32_t k1 = 0;

  switch(len & 3)
  {
  case 3: k1 ^= tail[2] << 16;
  case 2: k1 ^= tail[1] << 8;
  case 1: k1 ^= tail[0];
          k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
  };

  //----------
  // finalization

  h1 ^= len;

  h1 = fmix32(h1);

  *(uint32_t*)out = h1;
}

//-----------------------------------------------------------------------------

void MurmurHash3_x86_128 ( const void * key, const int len,
                           uint32_t seed, void * out )
{
  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = len / 16;
  int i;

  uint32_t h1 = seed;
  uint32_t h2 = seed;
  uint32_t h3 = seed;
  uint32_t h4 = seed;

  uint32_t c1 = 0x239b961b;
  uint32_t c2 = 0xab0e9789;
  uint32_t c3 = 0x38b34ae5;
  uint32_t c4 = 0xa1e38b93;

  //----------
  // body

  const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);

  for(i = -nblocks; i; i++)
  {
    uint32_t k1 = getblock(blocks,i*4+0);
    uint32_t k2 = getblock(blocks,i*4+1);
    uint32_t k3 = getblock(blocks,i*4+2);
    uint32_t k4 = getblock(blocks,i*4+3);

    k1 *= c1; k1  = ROTL32(k1,15); k1 *= c2; h1 ^= k1;

    h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;

    k2 *= c2; k2  = ROTL32(k2,16); k2 *= c3; h2 ^= k2;

    h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;

    k3 *= c3; k3  = ROTL32(k3,17); k3 *= c4; h3 ^= k3;

    h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;

    k4 *= c4; k4  = ROTL32(k4,18); k4 *= c1; h4 ^= k4;

    h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
  }

  //----------
  // tail

  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);

  uint32_t k1 = 0;
  uint32_t k2 = 0;
  uint32_t k3 = 0;
  uint32_t k4 = 0;

  switch(len & 15)
  {
  case 15: k4 ^= tail[14] << 16;
  case 14: k4 ^= tail[13] << 8;
  case 13: k4 ^= tail[12] << 0;
           k4 *= c4; k4  = ROTL32(k4,18); k4 *= c1; h4 ^= k4;

  case 12: k3 ^= tail[11] << 24;
  case 11: k3 ^= tail[10] << 16;
  case 10: k3 ^= tail[ 9] << 8;
  case  9: k3 ^= tail[ 8] << 0;
           k3 *= c3; k3  = ROTL32(k3,17); k3 *= c4; h3 ^= k3;

  case  8: k2 ^= tail[ 7] << 24;
  case  7: k2 ^= tail[ 6] << 16;
  case  6: k2 ^= tail[ 5] << 8;
  case  5: k2 ^= tail[ 4] << 0;
           k2 *= c2; k2  = ROTL32(k2,16); k2 *= c3; h2 ^= k2;

  case  4: k1 ^= tail[ 3] << 24;
  case  3: k1 ^= tail[ 2] << 16;
  case  2: k1 ^= tail[ 1] << 8;
  case  1: k1 ^= tail[ 0] << 0;
           k1 *= c1; k1  = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
  };

  //----------
  // finalization

  h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;

  h1 += h2; h1 += h3; h1 += h4;
  h2 += h1; h3 += h1; h4 += h1;

  h1 = fmix32(h1);
  h2 = fmix32(h2);
  h3 = fmix32(h3);
  h4 = fmix32(h4);

  h1 += h2; h1 += h3; h1 += h4;
  h2 += h1; h3 += h1; h4 += h1;

  ((uint32_t*)out)[0] = h1;
  ((uint32_t*)out)[1] = h2;
  ((uint32_t*)out)[2] = h3;
  ((uint32_t*)out)[3] = h4;
}

//-----------------------------------------------------------------------------

void MurmurHash3_x64_128 ( const void * key, const int len,
                           const uint32_t seed, void * out )
{
  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = len / 16;
  int i;

  uint64_t h1 = seed;
  uint64_t h2 = seed;

  uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
  uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);

  //----------
  // body

  const uint64_t * blocks = (const uint64_t *)(data);

  for(i = 0; i < nblocks; i++)
  {
    uint64_t k1 = getblock(blocks,i*2+0);
    uint64_t k2 = getblock(blocks,i*2+1);

    k1 *= c1; k1  = ROTL64(k1,31); k1 *= c2; h1 ^= k1;

    h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;

    k2 *= c2; k2  = ROTL64(k2,33); k2 *= c1; h2 ^= k2;

    h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
  }

  //----------
  // tail

  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);

  uint64_t k1 = 0;
  uint64_t k2 = 0;

  switch(len & 15)
  {
  case 15: k2 ^= (uint64_t)(tail[14]) << 48;
  case 14: k2 ^= (uint64_t)(tail[13]) << 40;
  case 13: k2 ^= (uint64_t)(tail[12]) << 32;
  case 12: k2 ^= (uint64_t)(tail[11]) << 24;
  case 11: k2 ^= (uint64_t)(tail[10]) << 16;
  case 10: k2 ^= (uint64_t)(tail[ 9]) << 8;
  case  9: k2 ^= (uint64_t)(tail[ 8]) << 0;
           k2 *= c2; k2  = ROTL64(k2,33); k2 *= c1; h2 ^= k2;

  case  8: k1 ^= (uint64_t)(tail[ 7]) << 56;
  case  7: k1 ^= (uint64_t)(tail[ 6]) << 48;
  case  6: k1 ^= (uint64_t)(tail[ 5]) << 40;
  case  5: k1 ^= (uint64_t)(tail[ 4]) << 32;
  case  4: k1 ^= (uint64_t)(tail[ 3]) << 24;
  case  3: k1 ^= (uint64_t)(tail[ 2]) << 16;
  case  2: k1 ^= (uint64_t)(tail[ 1]) << 8;
  case  1: k1 ^= (uint64_t)(tail[ 0]) << 0;
           k1 *= c1; k1  = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
  };

  //----------
  // finalization

  h1 ^= len; h2 ^= len;

  h1 += h2;
  h2 += h1;

  h1 = fmix64(h1);
  h2 = fmix64(h2);

  h1 += h2;
  h2 += h1;

  ((uint64_t*)out)[0] = h1;
  ((uint64_t*)out)[1] = h2;
}


-----------------------------------------------------------------

In stage one, I have benchmarked all 3 hash function. Here is the result:
-----------------------------------------------------------------
[qichang@aarchie hash]$ g++ -o benchmark benchmark.c
[qichang@aarchie hash]$ ./benchmark
x86_32:  48f2284b
Total time: 14.185739 seconds
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 13.185568 seconds
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 8.157431 seconds
-----------------------------------------------------------------
The first function spent: 14.185739 seconds
The second function spent: 13.185568 seconds
The third function spent: 8.157431 seconds
-----------------------------------------------------------------

In order to compare the results for each hash function with and without optimization, I decided to split these 3 hash functions, I will only execute one hash function at a time.
-rw-rw-r--. benchmark64128.c
-rw-rw-r--. benchmark86128.c
-rw-rw-r--. benchmark8632.c

As the optimization strategies, I mentioned in stage one, I will first go with the easiest way, which is to use altered build options to compile those hash functions. The second strategy I said I would use inline assembly to optimize it, but after I considered, it seems too difficult, and I don't think that there are many chances for me to improve the performance.

In the Makefile of SSDUP, I noticed that the software use 'gcc -std=gnu99' to compile, which it doesn't use any optimization option to compile and build SSDUP, so the result would be same as my stage one benchmark results. But since I split those into different files, I will do it again for each file. And then I will compile my benchmarked program with the -O3 option instead of compile without optimization options. I am going to run each file 5 times in order to produce an accurate result.
-----------------------------------------------------------------
First hash function (MurmurHash3_x86_32):
[qichang@aarchie hash]$ gcc -std=gnu99 -o benchmark8632 benchmark8632.c
[qichang@aarchie hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 14.248357 seconds
[qichang@aarchie hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 14.177813 seconds
[qichang@aarchie hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 14.190903 seconds
[qichang@aarchie hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 14.117388 seconds
[qichang@aarchie hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 14.223588 seconds

[qichang@aarchie hash]$ gcc -std=gnu99 -O3 -o benchmark8632 benchmark8632.c
[qichang@aarchie hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 1.574737 seconds
[qichang@aarchie hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 1.572717 seconds
[qichang@aarchie hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 1.573482 seconds
[qichang@aarchie hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 1.574128 seconds
[qichang@aarchie hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 1.573317 seconds
-----------------------------------------------------------------
Second hash function (MurmurHash3_x86_128):
[qichang@aarchie hash]$ gcc -std=gnu99 -o benchmark86128 benchmark86128.c
[qichang@aarchie hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 13.310612 seconds
[qichang@aarchie hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 13.332081 seconds
[qichang@aarchie hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 13.294588 seconds
[qichang@aarchie hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 13.240154 seconds
[qichang@aarchie hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 13.247286 seconds

[qichang@aarchie hash]$ gcc -std=gnu99 -O3 -o benchmark86128 benchmark86128.c
[qichang@aarchie hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 1.337854 seconds
[qichang@aarchie hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 1.338757 seconds
[qichang@aarchie hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 1.341055 seconds
[qichang@aarchie hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 1.339572 seconds
[qichang@aarchie hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 1.340968 seconds
-----------------------------------------------------------------
Third hash function (MurmurHash3_x64_128):
[qichang@aarchie hash]$ gcc -std=gnu99 -o benchmark64128 benchmark64128.c
[qichang@aarchie hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 8.195865 seconds
[qichang@aarchie hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 8.179205 seconds
[qichang@aarchie hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 8.190692 seconds
[qichang@aarchie hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 8.193841 seconds
[qichang@aarchie hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 8.234855 seconds

[qichang@aarchie hash]$ gcc -std=gnu99 -O3 -o benchmark64128 benchmark64128.c
[qichang@aarchie hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 1.304771 seconds
[qichang@aarchie hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 1.308439 seconds
[qichang@aarchie hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 1.315429 seconds
[qichang@aarchie hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 1.316185 seconds
[qichang@aarchie hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 1.314772 seconds
-----------------------------------------------------------------
For the first hash function (MurmurHash3_x86_32), the exectution time for -O3 is about 802% faster than without compilation option.
For the second hash function (MurmurHash3_x86_128), the execution time for -O3 is about 891% faster than without compilation option.
For the third hash function (MurmurHash3_x64_128), the execution time for -O3 is about 523% faster than without compilation option.

The first two hash functions, there has been optimized for x86 platforms, it has a huge improved in performance after using compilation build option -O3, the third hash function has been optimized for x64 platforms, it also has a huge difference than not using -O3, but not much than x86 platforms.

After using the compilation build option, I am going to improve the existing algorithms to optimize the functions. For this part, I will only try to optimize the third hash function, because it has been optimized for x64 platforms. Within the hash function, there is a function called getblock with the second parameter that is provided to the function is i*2+0 or i*2+1, I will use an addition operation instead of multiplication operation to test if there is any improvement in performance. Secondary, at the beginning of the x64 hash function, there is a code that nblocks = len / 16, and within the function, there is a line of code that performs the operation nblocks*16. The values of nblocks and len do not change the function, they are both constants and integer data type. So, nblocks*16 = len, and I can eliminate one operation by replacing the multiplication operation with variable len to test if there is any improvement in performance.
Here are what I changed:
-----------------------------------------------------------------
Before:
uint64_t k1 = getblock(blocks,i*2+0);
uint64_t k2 = getblock(blocks,i*2+1);

After:
uint64_t k1 = getblock(blocks,i+i+0);
uint64_t k2 = getblock(blocks,i+i+1);
-----------------------------------------------------------------
Before:
const uint8_t * tail = (const uint8_t*)(data + nblocks*16);

After:
const uint8_t * tail = (const uint8_t*)(data + len);
-----------------------------------------------------------------

Here is the result:
-----------------------------------------------------------------
[qichang@aarchie hash]$ gcc -std=gnu99 -o benchmark64128 benchmark64128.c
[qichang@aarchie hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 8.145001 seconds
[qichang@aarchie hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 8.143322 seconds
[qichang@aarchie hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 8.137052 seconds
[qichang@aarchie hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 8.195385 seconds
[qichang@aarchie hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 8.178763 seconds
-----------------------------------------------------------------
The average time of this hash function without changed is 8.1984 seconds.
The average time of this hash function with changed is 8.1596 seconds.
There is 0.04% faster with code changed.

In the next step, I am going to optimize the hash functions on an x86 system. Same as before, I will first compile my benchmark program without -O3 option, and then compile again with -O3 option, to compare the results if there is any improvement in the hash functions. Here are the results:
-----------------------------------------------------------------
First hash function (MurmurHash3_x86_32):
[qichang@xerxes hash]$ gcc -std=gnu99 -o benchmark8632 benchmark8632.c
[qichang@xerxes hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 9.321532 seconds
[qichang@xerxes hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 9.729160 seconds
[qichang@xerxes hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 9.325567 seconds
[qichang@xerxes hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 9.336685 seconds
[qichang@xerxes hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 9.720984 seconds

[qichang@xerxes hash]$ gcc -std=gnu99 -O3 -o benchmark8632 benchmark8632.c
[qichang@xerxes hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 1.144952 seconds
[qichang@xerxes hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 1.145310 seconds
[qichang@xerxes hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 1.144949 seconds
[qichang@xerxes hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 1.147177 seconds
[qichang@xerxes hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 1.144718 seconds
-----------------------------------------------------------------
Second hash function (MurmurHash3_x86_128):
[qichang@xerxes hash]$ gcc -std=gnu99 -o benchmark86128 benchmark86128.c
[qichang@xerxes hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 8.496733 seconds
[qichang@xerxes hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 8.512100 seconds
[qichang@xerxes hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 8.519267 seconds
[qichang@xerxes hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 8.516495 seconds
[qichang@xerxes hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 8.517119 seconds

[qichang@xerxes hash]$ gcc -std=gnu99 -O3 -o benchmark86128 benchmark86128.c
[qichang@xerxes hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 0.983106 seconds
[qichang@xerxes hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 0.982542 seconds
[qichang@xerxes hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 0.982342 seconds
[qichang@xerxes hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 0.982931 seconds
[qichang@xerxes hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 0.982379 seconds
-----------------------------------------------------------------
Third hash function (MurmurHash3_x64_128):
[qichang@xerxes hash]$ gcc -std=gnu99 -o benchmark64128 benchmark64128.c
[qichang@xerxes hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 4.855595 seconds
[qichang@xerxes hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 4.855948 seconds
[qichang@xerxes hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 4.849469 seconds
[qichang@xerxes hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 4.855682 seconds
[qichang@xerxes hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 4.848947 seconds

[qichang@xerxes hash]$ gcc -std=gnu99 -O3 -o benchmark64128 benchmark64128.c
[qichang@xerxes hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 1.293736 seconds
[qichang@xerxes hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 1.298314 seconds
[qichang@xerxes hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 1.296543 seconds
[qichang@xerxes hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 1.296466 seconds
[qichang@xerxes hash]$ ./benchmark64128
x64_128: 1fe39d94 98f9f86a 9f7dfcd5 f01f2d58
Total time: 1.294762 seconds
-----------------------------------------------------------------

For the first hash function (MurmurHash3_x86_32), average execution time without -O3 is about 9.49 seconds, with -O3 is about 1.15 seconds. The execution time with -O3 is about 725% faster than without compilation option.
For the second hash function (MurmurHash3_x86_128), average execution time without -O3 is about 8.51 seconds, with -O3 is about 1.14 seconds. The execution time with -O3 is about 646% faster than without compilation option.
For the third hash function (MurmurHash3_x86_128), average execution time without -O3 is about 4.85 seconds, with -O3 is about 1.29 seconds. The execution time with -O3 is about 275% faster than without compilation option.

The first two hash functions, which have been optimized for x86 platforms, have significantly improved in performance after compiling with -O3 option. The third hash function, which has been optimized for x64 platforms, has a slightly poorer performance after compiling with -O3 option.

In the next step of optimization, I will try to optimize the first two hash function, because of the first two hash function already optimized for x86 platforms. Like the ones I changed in the third function, in the first function, nblocks*4 = len, I will remove one operator by replacing the multiplication operator with the constant integer len to see if there is any improvement. Here are the code changed:
First function:
-----------------------------------------------------------------
Before:
const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);
const uint8_t * tail = (const uint8_t*)(data + nblocks*4);

After:
const uint32_t * blocks = (const uint32_t *)(data + len);
const uint8_t * tail = (const uint8_t*)(data + len);
-----------------------------------------------------------------
Second function:
-----------------------------------------------------------------
Before:
const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);
const uint8_t * tail = (const uint8_t*)(data + nblocks*16);

After:
const uint32_t * blocks = (const uint32_t *)(data + len);
const uint8_t * tail = (const uint8_t*)(data + len);
-----------------------------------------------------------------

Here are the results:
-----------------------------------------------------------------
First hash function with code changes (MurmurHash3_x86_32):
[qichang@xerxes hash]$ gcc -std=gnu99 -o benchmark8632 benchmark8632.c
[qichang@xerxes hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 9.726852 seconds
[qichang@xerxes hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 9.724399 seconds
[qichang@xerxes hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 9.724434 seconds
[qichang@xerxes hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 9.726194 seconds
[qichang@xerxes hash]$ ./benchmark8632
x86_32:  48f2284b
Total time: 9.360804 seconds
-----------------------------------------------------------------
Second hash function (MurmurHash3_x86_128):
[qichang@xerxes hash]$ gcc -std=gnu99 -o benchmark86128 benchmark86128.c
[qichang@xerxes hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 8.516874 seconds
[qichang@xerxes hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 8.523627 seconds
[qichang@xerxes hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 8.537537 seconds
[qichang@xerxes hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 8.528274 seconds
[qichang@xerxes hash]$ ./benchmark86128
x86_128: afcbbbf4 e795fb5f 25f06a82 bdb0d389
Total time: 8.525721 seconds
-----------------------------------------------------------------

For the first hash function (MurmurHash3_x86_32), average execution time without code changed is about 9.49 seconds, with code changed, is about 9.65 seconds.
For the second hash function (MurmurHash3_x86_128), average execution time without code changed is about 8.51 seconds, with code changed, is about 8.52 seconds.

All of the tests are first completed on an AArch64 system. My first step to optimize the hash function is to compile my benchmark program with -O3 compilation option. The first two hash functions, which have been optimized for x86 platforms, which has a significant improvement in performance. The third hash function, which has been optimized for x64 platforms, after compiling with -O3 option, which is a very small improvement in performance. My second step in optimization is to change some code in the third function, there is 0.04% faster than without changing the code.

Afterward, I perform the benchmark program on an x86_64 system, the results show that it also has a significant improvement in performance if compiling with -O3 option. But the improvement of the third function on an AArch64 system is not as much as different than x86_64 platforms. After that, I also change some code in the first two hash function for improvement in performance. What I disappointed is, after I changed the code and I thought it would have some improvement, but the truth is negative. The results show it slower than without code changed. In the end, compiling with compilation option -O3 is the best way for improvement in hash function performance.

Comments

Popular posts from this blog

Lab2

Complied C Lab In this lab, we were asked to compile a C program, using gcc command with different options. At the beginning of this lab, we wrote a simple C program that prints a message: Then using gcc command and the following compiler options to compile the program: -g # enable debugging information -O0 # do not optimize (that's a capital letter and then the digit zero) -fno-builtin # do not use builtin function optimizations Note that the size of file is 73088 bytes We can use objdump --source a.out command to show source code, the source code is under <main> section. And  readelf -p .rodata a.out contains the string to be printed. Then we add the option "-static" to recompile the program, found out the size is changed to 696264 bytes, which is bigger than the original program. And section headers are also increased. Next, I removed the builtin function optimization by remove option "-fno-builtin"...

Lab 6A

This lab is separated into two parts, I'll blog my work in different post. In the first part, we've got a source code from professor Chris, which is a similar stuff to our lab5, scaling the volume of sound, but it includes inline assembler. The first thing I'll do is add a timer to the code in order to check the performing time. Build and run the program, here is the output: ------------------------------------------------------------------------- [qichang@aarchie spo600_20181_inline_assembler_lab]$ ./vol_simd Generating sample data. Scaling samples. Summing samples. Result: -462 Time: 0.024963 seconds. ------------------------------------------------------------------------- Then I adjusted the number of samples to 5000000 in vol.h: ------------------------------------------------------------------------- [qichang@aarchie spo600_20181_inline_assembler_lab]$ cat vol_simd.c // vol_simd.c :: volume scaling in C using AArch64 SIMD // Chris Tyler 2017.11.29-2018...

Lab 5

In this lab, we are going to use different approaches to scale volume of sound, and the algorithm’s effect on system performance. Here is some basic knowledge of digital sound: Digital sound is usually represented by a signed 16-bit integer signal sample, taken at a rate of around 44.1 or 48 thousand samples per second for one stream of samples for the left and right stereo channels. In order to change the volume of sound, we will have to scale the volume factor for each sample, the range of 0.00 to 1.00 (silence to full volume). Here is the source code I got from professor: (vol1.h) ------------------------------------------------- #include <stdlib.h> #include <stdio.h> #include <stdint.h> #include "vol.h" // Function to scale a sound sample using a volume_factor // in the range of 0.00 to 1.00. static inline int16_t scale_sample(int16_t sample, float volume_factor) { return (int16_t) (volume_factor * (float) sample); } int main() { // Al...