diff --git a/1D-FFT/fft-opt1.cu b/1D-FFT/fft-opt1.cu new file mode 100644 index 0000000..333f50e --- /dev/null +++ b/1D-FFT/fft-opt1.cu @@ -0,0 +1,191 @@ +// One of the possible optimizations to the implementation in fft.cu +// +// The algorithm is as follows: +// +// for (s = 0 to log2(N)) ........ loop1 +// m = 2^i; +// statements +// for (j=0;j +#include +#include + +typedef float2 Complex; + +#define THREADS 32 +#define MAX_NO_OF_THREADS_PER_BLOCK 1024 + +const long long ARRAY_SIZE = 65536; +const long long ARRAY_BYTES = ARRAY_SIZE * sizeof(Complex); + +// Bit reversal re-ordering, first step of FFT +__global__ void bit_reverse_reorder(Complex *d_rev, Complex *d_a, int s) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + int rev = __brev(id) >> (32-s); + + if(id < ARRAY_SIZE) + d_rev[rev] = d_a[id]; +} + +// Work of the innermost loop, common to both parallelization +__device__ void inplace_fft(Complex *a, int j, int k, int m){ + + if (j+k+m/2 < ARRAY_SIZE){ + + Complex w, t, u; + + // w^k (w is root of unity) + w.x = __cosf((2*M_PI*k)/m); + w.y = -__sinf((2*M_PI*k)/m); + + // u = a[j+k] + u.x = a[j+k].x; + u.y = a[j+k].y; + + // t = w*a[j+k+m/2]; + t.x = w.x*a[j+k+m/2].x - w.y*a[j+k+m/2].y; + t.y = w.x*a[j+k+m/2].y + w.y*a[j+k+m/2].x; + + // a[j+k] = u+t; + a[j+k].x = u.x + t.x; + a[j+k].y = u.y + t.y; + + // a[j+k+m/2] = u-t; + a[j+k+m/2].x = u.x - t.x; + a[j+k+m/2].y = u.y - t.y; + + } +} + +// Parallelization of loop2 +__global__ void fft_outer(Complex *a, int m){ + int j = (blockIdx.x * blockDim.x + threadIdx.x)*m; + if (j < ARRAY_SIZE){ + for (int k=0;k>>(d_rev, d_a, s); + + cudaDeviceSynchronize(); + + // FFT driver code + for (int i=1;i<=s;i++){ + + // m = 2^i + int m = 1 << i; + + if (m/2 < MAX_NO_OF_THREADS_PER_BLOCK){ + + fft_outer<<<((ARRAY_SIZE/m)+THREADS-1)/THREADS,THREADS>>>(d_rev,m); + + } else { + + for (int j=0;j>>(d_rev,j,m); + + } + } + } + + //End of performance measurement + cudaEventRecord(stop); + + //Block CPU execution until the event "stop" is recorded + cudaEventSynchronize(stop); + + //Print the time taken in milliseconds + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + printf("The total time taken is %f milliseconds\n", milliseconds); + + // Copy result array from device to host + cudaMemcpy(h_rev,d_rev,ARRAY_BYTES,cudaMemcpyDeviceToHost); + + // Writing output to files + fprintf(fptr, "i\t\ta.magn\t\ta.real\t\t\ta.img\n"); + for (int i = 0; i < ARRAY_SIZE; i++) + { + fprintf(fptr,"%d\t\t%f\t\t%f\t\t%f\n", i, magnitude(h_rev[i]), h_rev[i].x, h_rev[i].y); + } + + // Free allocated device memory + cudaFree(d_a); + cudaFree(d_rev); + + return 0; +} \ No newline at end of file diff --git a/1D-FFT/fft-serialized.c b/1D-FFT/fft-serialized.c new file mode 100644 index 0000000..e7dd538 --- /dev/null +++ b/1D-FFT/fft-serialized.c @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include + +#ifndef M_PI + #define M_PI 3.14159265358979323846 +#endif + +#define ARRAY_SIZE 131072 + +typedef struct Complex{ + float re,im; +} Complex; + +const long long ARRAY_BYTES = ARRAY_SIZE*sizeof(Complex); + +int bit_reverse(int n, int s){ + int rev=0, count=s; + while(n){ + rev = rev << 1; + rev |= n & 1; + n = n >> 1; + count--; + } + rev = rev << count; + return rev; +} + +void bit_reverse_reorder(Complex *a, Complex *rev, int s){ + for(int i=0;i +#include +#include + +#define ll long long int +#define THREADS 32 + +typedef float2 Complex; + +const long long ARRAY_SIZE = 1024; +const long long ARRAY_BYTES = ARRAY_SIZE * sizeof(Complex); + +// Parallelized reordering (Doesn't this count as pre-processing?) +__global__ void bit_reverse_reorder (Complex *d_rev, Complex *d_a, int s){ + int id = (blockIdx.x * blockDim.x) + threadIdx.x; + int rev = __brev(id) >> (32-s); + if (id < ARRAY_SIZE) + d_rev[rev] = d_a[id]; +} + +// FFT driver kernel code +__global__ void fft(Complex *a, int j, int m){ + int k = blockIdx.x * blockDim.x + threadIdx.x; + + if(k < m/2 && j+k+m/2 < ARRAY_SIZE){ + + Complex w, t, u; + + // w^k (w is root of unity) + w.x = __cosf((2*M_PI*k)/m); + w.y = -__sinf((2*M_PI*k)/m); + + // u = a[j+k] + u.x = a[j+k].x; + u.y = a[j+k].y; + + // t = w*a[j+k+m/2]; + t.x = w.x*a[j+k+m/2].x - w.y*a[j+k+m/2].y; + t.y = w.x*a[j+k+m/2].y + w.y*a[j+k+m/2].x; + + // a[j+k] = u+t; + a[j+k].x = u.x + t.x; + a[j+k].y = u.y + t.y; + + // a[j+k+m/2] = u-t; + a[j+k+m/2].x = u.x - t.x; + a[j+k+m/2].y = u.y - t.y; + + } +} + +float magnitude(float2 a) +{ + return sqrt(a.x*a.x + a.y*a.y); +} + +int main(int argc, char *argv[]) { + + //Measuring performance + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + // Creating files to write output to + FILE *fptr; + fptr = fopen("fft-output.dat", "wr"); + + //Creating Complex arrays for data + Complex h_a[ARRAY_SIZE], h_rev[ARRAY_SIZE]; + Complex *d_a, *d_rev; + + // Input signal (of the form sin((2*M_PI*f*x)/N)) where N is the sample size + // imaginary part of the signal by default is 0 + for(int i = 0; i < ARRAY_SIZE; i++) { + h_a[i].x = sin((10*M_PI*i)/ARRAY_SIZE); + h_a[i].y = 0.0; + } + + // No. of bits in the sample size, used for bit reversal reordering + int s = (int)ceil(log2(ARRAY_SIZE)); + + //Allocate memory for the device arrays + cudaMalloc((void**) &d_a, ARRAY_BYTES); + cudaMalloc((void**) &d_rev, ARRAY_BYTES); + + //Copy all elements of sample array from host to device + cudaMemcpy(d_a, h_a, ARRAY_BYTES, cudaMemcpyHostToDevice); + + //Start of performance measurement + cudaEventRecord(start); + + //Reorder the sample as first step of FFT + bit_reverse_reorder<<<(ARRAY_SIZE+THREADS-1)/THREADS, THREADS>>>(d_rev, d_a, s); + + //Synchronise devices before jumping to fft + cudaDeviceSynchronize(); + + // Naive fft parallelization (TODO: improve upon the efficiency) + for (int i=0;i<=s;i++){ + + int m = 1 << i; + + for(int j=0;j>>(d_rev,j,m); + + } + } + + //End of performance measurement + cudaEventRecord(stop); + + //Block CPU execution until the event "stop" is recorded + cudaEventSynchronize(stop); + + //Print the time taken in milliseconds. + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + printf("The total time taken is %f milliseconds\n", milliseconds); + + // Copy result array to host + cudaMemcpy(h_rev, d_rev, ARRAY_BYTES, cudaMemcpyDeviceToHost); + + // Writing output to files + fprintf(fptr, "i\t\ta.magn\t\ta.real\t\t\ta.img\n"); + for (int i = 0; i < ARRAY_SIZE; i++) + { + fprintf(fptr,"%d\t\t%f\t\t%f\t\t%f\n", i, magnitude(h_rev[i]), h_rev[i].x, h_rev[i].y); + } + + // Free device memory + cudaFree(d_a); + cudaFree(d_rev); + + // TODO + // Use h_rev to plot magnitude (sqrt(h_rev[i].x^2 + h_rev[i].y^2)) vs frequency (i) +} \ No newline at end of file diff --git a/1D-FFT/inv_fft.cu b/1D-FFT/inv_fft.cu new file mode 100644 index 0000000..c8f4214 --- /dev/null +++ b/1D-FFT/inv_fft.cu @@ -0,0 +1,242 @@ +#include +#include +#include + +typedef float2 Complex; + +#define THREADS 32 +#define MAX_NO_OF_THREADS_PER_BLOCK 1024 + +long long ARRAY_SIZE; +long long ARRAY_BYTES; + +__global__ void bit_reverse_reorder(Complex *d_rev, Complex *d_a, int s, long long ARRAY_SIZE) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + int rev = __brev(id) >> (32-s); + + if(id < ARRAY_SIZE) + d_rev[rev] = d_a[id]; +} + +__global__ void swap_real_and_imaginary(Complex *d_rev, long long ARRAY_SIZE) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id < ARRAY_SIZE) { + float temp = d_rev[id].x; + d_rev[id].x = d_rev[id].y; + d_rev[id].y = temp; + } +} + +__global__ void clip(Complex *d_rev, long long cut_off_frequency, long long ARRAY_SIZE) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + if (id >= cut_off_frequency && id <= (ARRAY_SIZE - cut_off_frequency)) { + d_rev[id].x = 0.0f; + d_rev[id].y = 0.0f; + } +} + +__device__ void inplace_fft(Complex *a, int j, int k, int m, long long ARRAY_SIZE){ + + if (j+k+m/2 < ARRAY_SIZE){ + + Complex w, t, u; + + // w^k (w is root of unity) + w.x = __cosf((2*M_PI*k)/m); + w.y = -__sinf((2*M_PI*k)/m); + + // u = a[j+k] + u.x = a[j+k].x; + u.y = a[j+k].y; + + // t = w*a[j+k+m/2]; + t.x = w.x*a[j+k+m/2].x - w.y*a[j+k+m/2].y; + t.y = w.x*a[j+k+m/2].y + w.y*a[j+k+m/2].x; + + // a[j+k] = u+t; + a[j+k].x = u.x + t.x; + a[j+k].y = u.y + t.y; + + // a[j+k+m/2] = u-t; + a[j+k+m/2].x = u.x - t.x; + a[j+k+m/2].y = u.y - t.y; + + } +} + +__global__ void fft_outer(Complex *a, int m, long long ARRAY_SIZE){ + int j = (blockIdx.x * blockDim.x + threadIdx.x)*m; + if (j < ARRAY_SIZE){ + for (int k=0;k= 2) + cut_off_frequency = to_int(argv[1]); + + else + cut_off_frequency = 1000; + + // printf("%d\n", cut_off_frequency); + + + //Measuring performance + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + //Creating file to read input from + FILE *input; + input = fopen("../utils/input.dat", "r"); + + int fs; + + fscanf(input, "%d", &fs); + + fscanf(input, "%Ld", &ARRAY_SIZE); + + ARRAY_BYTES = ARRAY_SIZE * sizeof(Complex); + + // Creating files to write output to + FILE *fptr; + FILE *output; + + fptr = fopen("../utils/fft-opt1-output.dat", "wr"); + output = fopen("../utils/convert_to_wav.dat", "wr"); + + Complex *h_a = (Complex *)malloc(ARRAY_BYTES); + Complex *h_rev = (Complex *)malloc(ARRAY_BYTES); + + for(int i = 0; i < ARRAY_SIZE; i++) + { + fscanf(input, "%f", &h_a[i].x); + // h_a[i].x = sin((6.0*M_PI*i)/ARRAY_SIZE); + // printf("%f\n", h_a[i].x); + h_a[i].y = 0.0; + } + + int s = (int)ceil(log2(ARRAY_SIZE)); + + Complex *d_a, *d_rev, *d_rev1; + + cudaMalloc((void**) &d_a, ARRAY_BYTES); + cudaMalloc((void**) &d_rev, ARRAY_BYTES); + cudaMalloc((void**) &d_rev1, ARRAY_BYTES); + + cudaMemcpy(d_a, h_a, ARRAY_BYTES, cudaMemcpyHostToDevice); + + //Start of performance measurement + cudaEventRecord(start); + + bit_reverse_reorder<<<(ARRAY_SIZE+THREADS-1)/THREADS, THREADS>>>(d_rev, d_a, s, ARRAY_SIZE); + + cudaDeviceSynchronize(); + + for (int i=1;i<=s;i++){ + int m = 1 << i; + if (m/2 < MAX_NO_OF_THREADS_PER_BLOCK){ + fft_outer<<<((ARRAY_SIZE/m)+THREADS-1)/THREADS,THREADS>>>(d_rev,m,ARRAY_SIZE); + } else { + for (int j=0;j>>(d_rev,j,m,ARRAY_SIZE); + } + } + } + + cudaDeviceSynchronize(); + + // Clipping + clip<<<(ARRAY_SIZE+THREADS-1)/THREADS, THREADS>>>(d_rev, cut_off_frequency, ARRAY_SIZE); + + // Beginning of inverse FFT + + swap_real_and_imaginary<<<(ARRAY_SIZE+THREADS-1)/THREADS, THREADS>>>(d_rev, ARRAY_SIZE); + + cudaDeviceSynchronize(); + + bit_reverse_reorder<<<(ARRAY_SIZE+THREADS-1)/THREADS, THREADS>>>(d_rev1, d_rev, s, ARRAY_SIZE); + + cudaDeviceSynchronize(); + + for (int i=1;i<=s;i++) + { + int m = 1 << i; + if (m < sqrt(ARRAY_SIZE / 4)) + { + fft_outer<<<((ARRAY_SIZE/m)+THREADS-1)/THREADS,THREADS>>>(d_rev1,m,ARRAY_SIZE); + } + else + { + for (int j=0;j>>(d_rev1,j,m,ARRAY_SIZE); + } + } + } + + swap_real_and_imaginary<<<(ARRAY_SIZE+THREADS-1)/THREADS, THREADS>>>(d_rev1, ARRAY_SIZE); + + // End of performance measurement + cudaEventRecord(stop); + + // Block CPU execution until the event "stop" is recorded + cudaEventSynchronize(stop); + + //Print the time taken in milliseconds + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + printf("The total time taken is %f milliseconds\n", milliseconds); + + cudaMemcpy(h_rev,d_rev1,ARRAY_BYTES,cudaMemcpyDeviceToHost); + + cudaFree(d_a); + cudaFree(d_rev); + cudaFree(d_rev1); + + // for (int i=0;i +#include +#include + +typedef float2 Complex; + +#define THREADS 32 +#define MAX_NO_OF_THREADS_PER_BLOCK 1024 + +const int TILE_DIM = 32; +const int BLOCK_ROWS = 8; + +// TODO: Make sure the implementations works for non square matrices +const long long ARRAY_SIZE = 1024; +const long long ARRAY_BYTES = ARRAY_SIZE * sizeof(Complex); + +// Bit reversal re-ordering, first step of FFT +__global__ void bit_reverse_reorder(Complex *d_rev, Complex *d_a, int s) +{ + int row_id = blockIdx.y; + int col_id = threadIdx.x; + int row_size = ARRAY_SIZE; + int rev = __brev(col_id) >> (32-s); + + // d_rev[row_id][rev] = d_a[row_id][col_id] + d_rev[row_id * row_size + rev] = d_a[row_id * row_size + col_id]; +} + +// Optimized kernel for calculating FFT of all rows simultaneously + +__global__ void fft_2d(Complex *A, int s) +{ + int row_id = blockIdx.y; + int col_id = threadIdx.x; + int row_size = ARRAY_SIZE; + + // Shared memory allocated indepedent to each block + __shared__ Complex Work_Array[ARRAY_SIZE]; + + // Work_Array[col_id] = A[row_id][col_id]; + Work_Array[col_id].x = A[(row_id * row_size) + col_id].x; + Work_Array[col_id].y = A[(row_id * row_size) + col_id].y; + + // Work_Array[col_id + row_size/2] = A[row_id][col_id + row_size/2]; + Work_Array[col_id + row_size/2].x = A[(row_id * row_size) + (col_id + row_size/2)].x; + Work_Array[col_id + row_size/2].y = A[(row_id * row_size) + (col_id + row_size/2)].y; + + __syncthreads(); + + for(int i=1; i<=s; i++) + { + int m = 1 << i; + + int j = col_id * m; + + for(int k=0; k < m/2; k++) + { + if(j + k + m/2 < row_size) + { + Complex w, t, u; + + // w^k (w is root of unity) + w.x = __cosf((2*M_PI*k)/m); + w.y = -__sinf((2*M_PI*k)/m); + + // u = a[j+k] + u.x = Work_Array[j+k].x; + u.y = Work_Array[j+k].y; + + // t = w*a[j+k+m/2]; + t.x = w.x*Work_Array[j+k+m/2].x - w.y*Work_Array[j+k+m/2].y; + t.y = w.x*Work_Array[j+k+m/2].y + w.y*Work_Array[j+k+m/2].x; + + // a[j+k] = u+t; + Work_Array[j+k].x = u.x + t.x; + Work_Array[j+k].y = u.y + t.y; + + // a[j+k+m/2] = u-t; + Work_Array[j+k+m/2].x = u.x - t.x; + Work_Array[j+k+m/2].y = u.y - t.y; + + } + } + + __syncthreads(); + } + + // Copying data back from shared memory to global memory + + // A[row_id][col_id] = Work_Array[col_id]; + A[(row_id * row_size) + col_id].x = Work_Array[col_id].x; + A[(row_id * row_size) + col_id].y = Work_Array[col_id].y; + + // A[row_id][col_id + row_size/2] = Work_Array[col_id + row_size/2]; + A[(row_id * row_size) + (col_id + row_size/2)].x = Work_Array[col_id + row_size/2].x; + A[(row_id * row_size) + (col_id + row_size/2)].y = Work_Array[col_id + row_size/2].y; + +} + +__global__ void MatrixTranspose(Complex *d_out, Complex *d_in) +{ + __shared__ Complex tile[TILE_DIM][TILE_DIM+1]; + + int x = blockIdx.x * TILE_DIM + threadIdx.x; + int y = blockIdx.y * TILE_DIM + threadIdx.y; + int width = gridDim.x * TILE_DIM; + + for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) + tile[threadIdx.y+j][threadIdx.x] = d_in[(y+j)*width + x]; + + __syncthreads(); + + x = blockIdx.y * TILE_DIM + threadIdx.x; // transpose block offset + y = blockIdx.x * TILE_DIM + threadIdx.y; + + for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) + d_out[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j]; +} + +float magnitude(float2 a) +{ + return sqrt(a.x*a.x + a.y*a.y); +} + +int main() +{ + + //Measuring performance + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + // Creating files to write output to + FILE *fptr; + fptr = fopen("fft-2d-output.dat", "wr"); + + // Host arrays for input and output + Complex h_a[ARRAY_SIZE][ARRAY_SIZE]; + Complex h_rev[ARRAY_SIZE][ARRAY_SIZE]; + + // Input signal, complex part remains zero. + // Signal is of the form sin(2*M_PI*f*x/N) or cos(2*M_PI*f*x/N) + // N is sample size and is always a power of 2 + for(int i = 0; i < ARRAY_SIZE; i++) + { + for(int j = 0; j < ARRAY_SIZE; j++) + { + h_a[i][j].x = cos((6.0*M_PI*j)/ARRAY_SIZE); + h_a[i][j].y = 0.0; + } + } + + // No of bits required to represent N + int s = (int)ceil(log2(ARRAY_SIZE)); + + // Device arrays + Complex *d_a, *d_rev, *d_rev1; + + // Memory Allocation + cudaMalloc((void**) &d_a, ARRAY_BYTES * ARRAY_SIZE); + cudaMalloc((void**) &d_rev, ARRAY_BYTES * ARRAY_SIZE); + cudaMalloc((void**) &d_rev1, ARRAY_BYTES * ARRAY_SIZE); + + // Copy host array to device + cudaMemcpy(d_a, *h_a, ARRAY_BYTES * ARRAY_SIZE, cudaMemcpyHostToDevice); + + //Start of performance measurement + cudaEventRecord(start); + + // First step in FFT, bit-reverse reordering + // Swap an element at index 'i', with the element + // which is the bit-string reversal of 'i' + + bit_reverse_reorder<<>>(d_rev, d_a, s); + + cudaDeviceSynchronize(); + + // FFT driver code for row wise FFT + // Assigning every row of the 2d fft to a block of threads and calling N/2 threads for each block + fft_2d <<>> (d_rev, s); + + // TODO : Code for matrix transpose - DONE + + cudaMemset(d_rev1, 0, ARRAY_BYTES*ARRAY_SIZE); + + // Taking transpose of d_rev and storing in d_rev1 + MatrixTranspose<<>>(d_rev1, d_rev); + + // Column wise bit-reverse reordering of d_rev1, and storing in d_rev + bit_reverse_reorder<<>>(d_rev, d_rev1, s); + + cudaDeviceSynchronize(); + + // FFT driver code for column wise FFT + + fft_2d <<>> (d_rev, s); + + //End of performance measurement + cudaEventRecord(stop); + + //Block CPU execution until the event "stop" is recorded + cudaEventSynchronize(stop); + + //Print the time taken in milliseconds + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + printf("The total time taken is %f milliseconds\n", milliseconds); + + // Copy result array from device to host + cudaMemcpy(*h_rev, d_rev, ARRAY_BYTES * ARRAY_SIZE, cudaMemcpyDeviceToHost); + + // Writing output to files + fprintf(fptr, "[i][j]\t\ta.magn\t\ta.real\t\t\ta.img\n"); + for (int i = 0; i < ARRAY_SIZE; i++) + { + for(int j = 0; j < ARRAY_SIZE; j++) + { + fprintf(fptr,"[%d][%d]\t\t%f\t\t%f\t\t%f\n", i, j, magnitude(h_rev[i][j]), h_rev[i][j].x, h_rev[i][j].y); + } + } + + // Free allocated device memory + cudaFree(d_a); + cudaFree(d_rev); + cudaFree(d_rev1); + + return 0; +} \ No newline at end of file diff --git a/README.md b/README.md index 03ccb35..8e0233b 100644 --- a/README.md +++ b/README.md @@ -1 +1,5 @@ # Parallelizing-FFT + + This project is an parallelized implementation of Fast Fourier + Transform (specifically Cooley Tukey algorithm) using CUDA. + diff --git a/utils/1kplus250.wav b/utils/1kplus250.wav new file mode 100644 index 0000000..6814b83 Binary files /dev/null and b/utils/1kplus250.wav differ