This lab is separated into two parts, so I'll blog about my work in separate posts.
In the first part, we were given source code by Professor Chris that does something similar to our lab 5 (scaling the volume of sound samples), but it uses inline assembler.
The first thing I did was add a timer to the code in order to measure the execution time. After building and running the program, here is the output:
-------------------------------------------------------------------------
[qichang@aarchie spo600_20181_inline_assembler_lab]$ ./vol_simd
Generating sample data.
Scaling samples.
Summing samples.
Result: -462
Time: 0.024963 seconds.
-------------------------------------------------------------------------
Then I adjusted the number of samples to 5000000 in vol.h:
-------------------------------------------------------------------------
[qichang@aarchie spo600_20181_inline_assembler_lab]$ cat vol_simd.c
// vol_simd.c :: volume scaling in C using AArch64 SIMD
// Chris Tyler 2017.11.29-2018.02.20
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include "vol.h"
// Generates SAMPLES pseudo-random 16-bit samples, scales them by 0.75 using
// AArch64 SIMD inline assembler (eight samples per iteration), then prints a
// checksum of the scaled output. Returns 0 on success, EXIT_FAILURE if the
// sample buffers cannot be allocated.
int main(void) {
    int16_t* in;     // input array
    int16_t* limit;  // end of input array
    int16_t* out;    // output array

    // these variables will be used in our assembler code, so we're going
    // to hand-allocate which register they are placed in
    // Q: what is an alternate approach?
    register int16_t* in_cursor asm("r20");   // input cursor
    register int16_t* out_cursor asm("r21");  // output cursor
    register int16_t vol_int asm("r22");      // volume as int16_t

    int x;        // array iterator
    int ttl = 0;  // array total; must start at 0, reading it uninitialized is undefined

    in = (int16_t*) calloc(SAMPLES, sizeof(int16_t));
    out = (int16_t*) calloc(SAMPLES, sizeof(int16_t));
    if (in == NULL || out == NULL) {
        fprintf(stderr, "Unable to allocate sample buffers.\n");
        free(in);
        free(out);
        return EXIT_FAILURE;
    }

    srand(-1);
    printf("Generating sample data.\n");
    for (x = 0; x < SAMPLES; x++) {
        in[x] = (rand()%65536)-32768;
    }

    // --------------------------------------------------------------------
    in_cursor = in;
    out_cursor = out;
    // NOTE(review): the loop below always moves 16 bytes (eight samples) at a
    // time, so this assumes SAMPLES is a multiple of 8 -- confirm in vol.h.
    limit = in + SAMPLES;

    // set vol_int to fixed-point representation of 0.75
    // Q: should we use 32767 or 32768 in next line? why?
    vol_int = (int16_t) (0.75 * 32767.0);

    printf("Scaling samples.\n");

    // Q: what does it mean to "duplicate" values in the next line?
    // volatile keeps the statement from being dropped (it has no outputs);
    // the "v1" clobber tells the compiler that v1 is overwritten here.
    // NOTE(review): the loop still relies on v1 keeping this value between
    // separate asm statements, which the compiler is not told about.
    __asm__ volatile ("dup v1.8h,%w0" : : "r"(vol_int) : "v1");  // duplicate vol_int into v1.8h

    while (in_cursor < limit) {
        __asm__ volatile (
            "ldr q0, [%[in]],#16 \n\t"
            // load eight samples into q0 (v0.8h)
            // from in_cursor, and post-increment
            // in_cursor by 16 bytes
            "sqdmulh v0.8h, v0.8h, v1.8h \n\t"
            // multiply each lane in v0 by v1*2
            // saturate results
            // store upper 16 bits of results into v0
            "str q0, [%[out]],#16 \n\t"
            // store eight samples to out_cursor
            // post-increment out_cursor by 16 bytes
            // Q: what happens if we remove the following
            // operand lines? Why?
            // Both cursors are post-incremented by the template, so both are
            // read-write ("+r") operands.
            : [in]"+r"(in_cursor), [out]"+r"(out_cursor)
            : // no input-only operands
            // v0 is used as scratch; "memory" because the template reads the
            // input buffer and writes the output buffer behind the compiler's back
            : "v0", "memory"
        );
    }

    // --------------------------------------------------------------------
    printf("Summing samples.\n");
    for (x = 0; x < SAMPLES; x++) {
        ttl = (ttl + out[x]) % 1000;
    }

    // Q: are the results usable? are they correct?
    printf("Result: %d\n", ttl);

    free(in);
    free(out);
    return 0;
}
-------------------------------------------------------------------------
Here is the output:
-------------------------------------------------------------------------
[qichang@aarchie spo600_20181_inline_assembler_lab]$ ./vol_simd
Generating sample data.
Scaling samples.
Summing samples.
Result: 362
Time: 0.251086 seconds.
-------------------------------------------------------------------------
Looking back at the solution I wrote in lab 5, I also changed the number of samples to 5000000 there:
-------------------------------------------------------------------------
[qichang@aarchie spo600_20181_vol_skel]$ cat vol2.c
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include "vol.h"
#include <time.h>
// Function to scale a sound sample using a volume_factor
// in the range of 0.00 to 1.00.
// Multiplies one signed 16-bit sample by volume_factor (expected to be in
// the range 0.00 to 1.00) in floating point and converts the product back
// to int16_t, discarding the fractional part.
static inline int16_t scale_sample(int16_t sample, float volume_factor) {
    const float scaled = (float) sample * volume_factor;
    return (int16_t) scaled;
}
// Generates SAMPLES pseudo-random 16-bit samples, scales them by 0.75 both
// with scale_sample() and with a precomputed lookup table, then prints the
// first ten table-scaled samples, a checksum of the scale_sample() output and
// the elapsed CPU time. Returns 0 on success, EXIT_FAILURE if a buffer
// cannot be allocated.
int main(void) {
    // Allocate memory for large in and out arrays
    int16_t* in;
    int16_t* out;
    int16_t* lookupTable = calloc(65536, sizeof(int16_t));
    in = (int16_t*) calloc(SAMPLES, sizeof(int16_t));
    out = (int16_t*) calloc(SAMPLES, sizeof(int16_t));
    if (lookupTable == NULL || in == NULL || out == NULL) {
        fprintf(stderr, "Unable to allocate sample buffers.\n");
        free(lookupTable);
        free(in);
        free(out);
        return EXIT_FAILURE;
    }

    int x;
    int ttl = 0;  // running checksum; must start at 0, reading it uninitialized is undefined
    clock_t start, end, total;

    // Seed the pseudo-random number generator
    srand(-1);

    // Fill the array with random data
    for (x = 0; x < SAMPLES; x++) {
        in[x] = (rand()%65536)-32768;
    }

    // Initialize the lookup table: entry i holds sample value (i - 32768)
    // multiplied by the volume factor
    for (int i = 0; i < 65536; i++) {
        lookupTable[i] = (i-32768) * 0.75;
    }

    // The timed region covers both the scaling loop and the summing loop below.
    start = clock();

    // ######################################
    // This is the interesting part!
    // Scale the volume of all of the samples
    for (x = 0; x < SAMPLES; x++) {
        out[x] = scale_sample(in[x], 0.75);
    }
    // ######################################

    // Sum up the data. in[x] is overwritten with its table-scaled value, so
    // the samples printed below come from the lookup table, while the
    // checksum is taken over the scale_sample() output.
    for (x = 0; x < SAMPLES; x++) {
        in[x] = (lookupTable[in[x] + 32768]);
        ttl = (ttl + out[x]) % 1000;
    }

    end = clock();
    total = end - start;

    // Print at most the first ten samples; never read past the buffer.
    for (int i = 0; i < 10 && i < SAMPLES; i++) {
        printf("%d \n", in[i]);
    }

    // Print the sum
    printf("Result: %d\n", ttl);
    printf("Time spending for scaling the sound samples: %f seconds.\n",
           (float) total/CLOCKS_PER_SEC);

    free(lookupTable);
    free(in);
    free(out);
    return 0;
}
-------------------------------------------------------------------------
Here is the output:
-------------------------------------------------------------------------
[qichang@aarchie spo600_20181_vol_skel]$ ./vol2
17516
10521
8070
12072
-789
9538
1708
-15747
-20292
-15138
Result: 668
Time spending for scaling the sound samples: 0.036603 seconds.
-------------------------------------------------------------------------
// Q: what is an alternate approach?
register int16_t* in_cursor asm("r20"); // input cursor
register int16_t* out_cursor asm("r21"); // output cursor
register int16_t vol_int asm("r22"); // volume as int16_t
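A: Let the compiler choose the registers itself by passing the variables to the asm statement as named operands with "r" constraints, instead of pinning each variable to a specific register.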
-------------------------------------------------------------------------
// Q: should we use 32767 or 32768 in next line? why?
vol_int = (int16_t) (0.75 * 32767.0);
A: We should use 32767, because that is the maximum value an int16_t can hold (with 32768, a volume factor of 1.0 would no longer fit).
-------------------------------------------------------------------------
// Q: what does it mean to "duplicate" values in the next line?
__asm__ ("dup v1.8h,%w0"::"r"(vol_int)); // duplicate vol_int into v1.8h
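A: It means the 16-bit value of vol_int, passed in a general-purpose register (operand %w0), is copied into every one of the eight 16-bit lanes of vector register v1 (v1.8h), so all eight lanes hold the same volume factor.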
-------------------------------------------------------------------------
// Q: what happens if we remove the following
// two lines? Why?
: [in]"+r"(in_cursor)
: "0"(in_cursor),[out]"r"(out_cursor)
);
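A: It doesn't even compile: the assembler template still refers to the named operands %[in] and %[out], but the two removed lines are where those operands are defined.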
[qichang@aarchie spo600_20181_inline_assembler_lab]$ make
gcc -g -O3 vol_simd.c -o vol_simd
vol_simd.c: In function ‘main’:
vol_simd.c:71:4: error: undefined named operand ‘in’
);
^
vol_simd.c:71:4: error: undefined named operand ‘out’
make: *** [Makefile:7: vol_simd] Error 1
-------------------------------------------------------------------------
// Q: are the results usable? are they correct?
A: The results are usable, and they are correct.
In the first part, we were given source code by Professor Chris that does something similar to our lab 5 (scaling the volume of sound samples), but it uses inline assembler.
The first thing I did was add a timer to the code in order to measure the execution time. After building and running the program, here is the output:
-------------------------------------------------------------------------
[qichang@aarchie spo600_20181_inline_assembler_lab]$ ./vol_simd
Generating sample data.
Scaling samples.
Summing samples.
Result: -462
Time: 0.024963 seconds.
-------------------------------------------------------------------------
Then I adjusted the number of samples to 5000000 in vol.h:
-------------------------------------------------------------------------
[qichang@aarchie spo600_20181_inline_assembler_lab]$ cat vol_simd.c
// vol_simd.c :: volume scaling in C using AArch64 SIMD
// Chris Tyler 2017.11.29-2018.02.20
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include "vol.h"
// Generates SAMPLES pseudo-random 16-bit samples, scales them by 0.75 using
// AArch64 SIMD inline assembler (eight samples per iteration), then prints a
// checksum of the scaled output. Returns 0 on success, EXIT_FAILURE if the
// sample buffers cannot be allocated.
int main(void) {
    int16_t* in;     // input array
    int16_t* limit;  // end of input array
    int16_t* out;    // output array

    // these variables will be used in our assembler code, so we're going
    // to hand-allocate which register they are placed in
    // Q: what is an alternate approach?
    register int16_t* in_cursor asm("r20");   // input cursor
    register int16_t* out_cursor asm("r21");  // output cursor
    register int16_t vol_int asm("r22");      // volume as int16_t

    int x;        // array iterator
    int ttl = 0;  // array total; must start at 0, reading it uninitialized is undefined

    in = (int16_t*) calloc(SAMPLES, sizeof(int16_t));
    out = (int16_t*) calloc(SAMPLES, sizeof(int16_t));
    if (in == NULL || out == NULL) {
        fprintf(stderr, "Unable to allocate sample buffers.\n");
        free(in);
        free(out);
        return EXIT_FAILURE;
    }

    srand(-1);
    printf("Generating sample data.\n");
    for (x = 0; x < SAMPLES; x++) {
        in[x] = (rand()%65536)-32768;
    }

    // --------------------------------------------------------------------
    in_cursor = in;
    out_cursor = out;
    // NOTE(review): the loop below always moves 16 bytes (eight samples) at a
    // time, so this assumes SAMPLES is a multiple of 8 -- confirm in vol.h.
    limit = in + SAMPLES;

    // set vol_int to fixed-point representation of 0.75
    // Q: should we use 32767 or 32768 in next line? why?
    vol_int = (int16_t) (0.75 * 32767.0);

    printf("Scaling samples.\n");

    // Q: what does it mean to "duplicate" values in the next line?
    // volatile keeps the statement from being dropped (it has no outputs);
    // the "v1" clobber tells the compiler that v1 is overwritten here.
    // NOTE(review): the loop still relies on v1 keeping this value between
    // separate asm statements, which the compiler is not told about.
    __asm__ volatile ("dup v1.8h,%w0" : : "r"(vol_int) : "v1");  // duplicate vol_int into v1.8h

    while (in_cursor < limit) {
        __asm__ volatile (
            "ldr q0, [%[in]],#16 \n\t"
            // load eight samples into q0 (v0.8h)
            // from in_cursor, and post-increment
            // in_cursor by 16 bytes
            "sqdmulh v0.8h, v0.8h, v1.8h \n\t"
            // multiply each lane in v0 by v1*2
            // saturate results
            // store upper 16 bits of results into v0
            "str q0, [%[out]],#16 \n\t"
            // store eight samples to out_cursor
            // post-increment out_cursor by 16 bytes
            // Q: what happens if we remove the following
            // operand lines? Why?
            // Both cursors are post-incremented by the template, so both are
            // read-write ("+r") operands.
            : [in]"+r"(in_cursor), [out]"+r"(out_cursor)
            : // no input-only operands
            // v0 is used as scratch; "memory" because the template reads the
            // input buffer and writes the output buffer behind the compiler's back
            : "v0", "memory"
        );
    }

    // --------------------------------------------------------------------
    printf("Summing samples.\n");
    for (x = 0; x < SAMPLES; x++) {
        ttl = (ttl + out[x]) % 1000;
    }

    // Q: are the results usable? are they correct?
    printf("Result: %d\n", ttl);

    free(in);
    free(out);
    return 0;
}
-------------------------------------------------------------------------
Here is the output:
-------------------------------------------------------------------------
[qichang@aarchie spo600_20181_inline_assembler_lab]$ ./vol_simd
Generating sample data.
Scaling samples.
Summing samples.
Result: 362
Time: 0.251086 seconds.
-------------------------------------------------------------------------
Looking back at the solution I wrote in lab 5, I also changed the number of samples to 5000000 there:
-------------------------------------------------------------------------
[qichang@aarchie spo600_20181_vol_skel]$ cat vol2.c
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include "vol.h"
#include <time.h>
// Function to scale a sound sample using a volume_factor
// in the range of 0.00 to 1.00.
// Multiplies one signed 16-bit sample by volume_factor (expected to be in
// the range 0.00 to 1.00) in floating point and converts the product back
// to int16_t, discarding the fractional part.
static inline int16_t scale_sample(int16_t sample, float volume_factor) {
    const float scaled = (float) sample * volume_factor;
    return (int16_t) scaled;
}
// Generates SAMPLES pseudo-random 16-bit samples, scales them by 0.75 both
// with scale_sample() and with a precomputed lookup table, then prints the
// first ten table-scaled samples, a checksum of the scale_sample() output and
// the elapsed CPU time. Returns 0 on success, EXIT_FAILURE if a buffer
// cannot be allocated.
int main(void) {
    // Allocate memory for large in and out arrays
    int16_t* in;
    int16_t* out;
    int16_t* lookupTable = calloc(65536, sizeof(int16_t));
    in = (int16_t*) calloc(SAMPLES, sizeof(int16_t));
    out = (int16_t*) calloc(SAMPLES, sizeof(int16_t));
    if (lookupTable == NULL || in == NULL || out == NULL) {
        fprintf(stderr, "Unable to allocate sample buffers.\n");
        free(lookupTable);
        free(in);
        free(out);
        return EXIT_FAILURE;
    }

    int x;
    int ttl = 0;  // running checksum; must start at 0, reading it uninitialized is undefined
    clock_t start, end, total;

    // Seed the pseudo-random number generator
    srand(-1);

    // Fill the array with random data
    for (x = 0; x < SAMPLES; x++) {
        in[x] = (rand()%65536)-32768;
    }

    // Initialize the lookup table: entry i holds sample value (i - 32768)
    // multiplied by the volume factor
    for (int i = 0; i < 65536; i++) {
        lookupTable[i] = (i-32768) * 0.75;
    }

    // The timed region covers both the scaling loop and the summing loop below.
    start = clock();

    // ######################################
    // This is the interesting part!
    // Scale the volume of all of the samples
    for (x = 0; x < SAMPLES; x++) {
        out[x] = scale_sample(in[x], 0.75);
    }
    // ######################################

    // Sum up the data. in[x] is overwritten with its table-scaled value, so
    // the samples printed below come from the lookup table, while the
    // checksum is taken over the scale_sample() output.
    for (x = 0; x < SAMPLES; x++) {
        in[x] = (lookupTable[in[x] + 32768]);
        ttl = (ttl + out[x]) % 1000;
    }

    end = clock();
    total = end - start;

    // Print at most the first ten samples; never read past the buffer.
    for (int i = 0; i < 10 && i < SAMPLES; i++) {
        printf("%d \n", in[i]);
    }

    // Print the sum
    printf("Result: %d\n", ttl);
    printf("Time spending for scaling the sound samples: %f seconds.\n",
           (float) total/CLOCKS_PER_SEC);

    free(lookupTable);
    free(in);
    free(out);
    return 0;
}
-------------------------------------------------------------------------
Here is the output:
-------------------------------------------------------------------------
[qichang@aarchie spo600_20181_vol_skel]$ ./vol2
17516
10521
8070
12072
-789
9538
1708
-15747
-20292
-15138
Result: 668
Time spending for scaling the sound samples: 0.036603 seconds.
-------------------------------------------------------------------------
// Q: what is an alternate approach?
register int16_t* in_cursor asm("r20"); // input cursor
register int16_t* out_cursor asm("r21"); // output cursor
register int16_t vol_int asm("r22"); // volume as int16_t
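A: Let the compiler choose the registers itself by passing the variables to the asm statement as named operands with "r" constraints, instead of pinning each variable to a specific register.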
-------------------------------------------------------------------------
// Q: should we use 32767 or 32768 in next line? why?
vol_int = (int16_t) (0.75 * 32767.0);
A: We should use 32767, because that is the maximum value an int16_t can hold (with 32768, a volume factor of 1.0 would no longer fit).
-------------------------------------------------------------------------
// Q: what does it mean to "duplicate" values in the next line?
__asm__ ("dup v1.8h,%w0"::"r"(vol_int)); // duplicate vol_int into v1.8h
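A: It means the 16-bit value of vol_int, passed in a general-purpose register (operand %w0), is copied into every one of the eight 16-bit lanes of vector register v1 (v1.8h), so all eight lanes hold the same volume factor.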
-------------------------------------------------------------------------
// Q: what happens if we remove the following
// two lines? Why?
: [in]"+r"(in_cursor)
: "0"(in_cursor),[out]"r"(out_cursor)
);
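A: It doesn't even compile: the assembler template still refers to the named operands %[in] and %[out], but the two removed lines are where those operands are defined.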
[qichang@aarchie spo600_20181_inline_assembler_lab]$ make
gcc -g -O3 vol_simd.c -o vol_simd
vol_simd.c: In function ‘main’:
vol_simd.c:71:4: error: undefined named operand ‘in’
);
^
vol_simd.c:71:4: error: undefined named operand ‘out’
make: *** [Makefile:7: vol_simd] Error 1
-------------------------------------------------------------------------
// Q: are the results usable? are they correct?
A: The results are usable, and they are correct.
Comments
Post a Comment