Skip to content
This repository was archived by the owner on Mar 14, 2023. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
79332d1
Revert "Revert "Added bit-reversal reorder""
gnarayang Dec 9, 2019
c191a04
Added naive parallelization of fft
Shashwatha-Mitra Dec 9, 2019
d14baad
Merge pull request #5 from Shashwatha-Mitra/development
gnarayang Dec 9, 2019
351f0e7
Made fft parallelization scalable
Shashwatha-Mitra Dec 10, 2019
d7952bf
Merge pull request #6 from Shashwatha-Mitra/development
gnarayang Dec 11, 2019
3e766f7
Added serialized implementation of fft
Shashwatha-Mitra Dec 11, 2019
e1bf711
Added first optimization
Shashwatha-Mitra Dec 11, 2019
c2a8f8d
Merge pull request #7 from Shashwatha-Mitra/development
gnarayang Dec 12, 2019
b22ae2d
Add code to write output to files
gnarayang Dec 13, 2019
4077856
Added extra statements to print
gnarayang Dec 14, 2019
d5bf8cb
Merge pull request #8 from gnarayang/development
Shashwatha-Mitra Dec 16, 2019
9b70e7b
Update README.md
Shashwatha-Mitra Dec 16, 2019
3091c27
Update README.md
Shashwatha-Mitra Dec 19, 2019
526e73c
Added performance metrics
Shashwatha-Mitra Dec 19, 2019
2540fcc
Merge pull request #11 from Shashwatha-Mitra/development
Shashwatha-Mitra Dec 19, 2019
9c5ac76
Removed duplicate cudaMemcpy statements
Shashwatha-Mitra Dec 19, 2019
d39894d
Change repository structure
gnarayang Dec 25, 2019
1ac1390
Add naive implementation of 2D FFT
gnarayang Jan 14, 2020
4cd6546
Merge pull request #12 from gnarayang/development
Shashwatha-Mitra Jan 17, 2020
7faecda
Add optimization for row-wise FFT and transpose
niranjansy Mar 3, 2020
f2d0951
Merge pull request #13 from niranjansy/development
Shashwatha-Mitra Mar 4, 2020
2f4f08a
Add inverse 1D FFT
niranjansy Mar 6, 2020
b95f6bc
Merge pull request #14 from niranjansy/development
gnarayang Mar 7, 2020
d298b7f
1D-FFT: Add changes to process a wav file
appu234 Mar 8, 2020
06e88fe
utils: Add file to be processed
appu234 Mar 8, 2020
7796124
1D-FFT: Add code to take in command line arguments
appu234 Mar 8, 2020
e17a911
Merge pull request #15 from appu234/development
Shashwatha-Mitra Mar 8, 2020
424f128
Frequency filter
Shashwatha-Mitra Mar 8, 2020
4d1a203
Fixed bug
Shashwatha-Mitra Mar 8, 2020
46e168d
Merge pull request #16 from Shashwatha-Mitra/development
Shashwatha-Mitra Mar 8, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
191 changes: 191 additions & 0 deletions 1D-FFT/fft-opt1.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
// One of the possible optimizations to the implementation in fft.cu
//
// The algorithm is as follows:
//
// for (s = 0 to log2(N)) ........ loop1
// m = 2^i;
// statements
// for (j=0;j<N;j+=m) ....... loop2
// for (k=0 to m/2) ..... loop3
// statements
//
// The second and third loop are complementary to each other in the sense that
// while loop2 runs for n/m values of j, loop3 runs for m/2 values of k. With
// larger values of m, loop3 does more work, while for smaller values of m, loop2
// does more work, so in this optimization, we plan to separate out the two cases
// and achieve parallelization of both outer and inner loops
//
// The point of separation needs to be experimentally arrived at, i.e, we need to
// test out different cases and choose the best of the lot. As a first commit in
// the optimization, we have adopted a naive approach and check the value of m/2.
// If the value of m/2 is less than the maximum number of threads that can be
// spawned by a block, we parallelize loop2, and loop3 otherwise.
//
//

#include <stdio.h>
#include <cmath>
#include <cuda.h>

typedef float2 Complex;

#define THREADS 32
#define MAX_NO_OF_THREADS_PER_BLOCK 1024

const long long ARRAY_SIZE = 65536;
const long long ARRAY_BYTES = ARRAY_SIZE * sizeof(Complex);

// Bit reversal re-ordering, first step of FFT
__global__ void bit_reverse_reorder(Complex *d_rev, Complex *d_a, int s) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
int rev = __brev(id) >> (32-s);

if(id < ARRAY_SIZE)
d_rev[rev] = d_a[id];
}

// Work of the innermost loop, common to both parallelization
__device__ void inplace_fft(Complex *a, int j, int k, int m){

if (j+k+m/2 < ARRAY_SIZE){

Complex w, t, u;

// w^k (w is root of unity)
w.x = __cosf((2*M_PI*k)/m);
w.y = -__sinf((2*M_PI*k)/m);

// u = a[j+k]
u.x = a[j+k].x;
u.y = a[j+k].y;

// t = w*a[j+k+m/2];
t.x = w.x*a[j+k+m/2].x - w.y*a[j+k+m/2].y;
t.y = w.x*a[j+k+m/2].y + w.y*a[j+k+m/2].x;

// a[j+k] = u+t;
a[j+k].x = u.x + t.x;
a[j+k].y = u.y + t.y;

// a[j+k+m/2] = u-t;
a[j+k+m/2].x = u.x - t.x;
a[j+k+m/2].y = u.y - t.y;

}
}

// Parallelization of loop2
__global__ void fft_outer(Complex *a, int m){
int j = (blockIdx.x * blockDim.x + threadIdx.x)*m;
if (j < ARRAY_SIZE){
for (int k=0;k<m/2;k++){
inplace_fft(a,j,k,m);
}
}
}

// Parallelization of loop3
__global__ void fft_inner(Complex *a, int j, int m){
int k = (blockIdx.x * blockDim.x + threadIdx.x);
if (k < m/2)
inplace_fft(a,j,k,m);
}

float magnitude(float2 a)
{
return sqrt(a.x*a.x + a.y*a.y);
}

int main() {

//Measuring performance
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

// Creating files to write output to
FILE *fptr;
fptr = fopen("fft-opt1-output.dat", "wr");

// Host arrays for input and output
Complex h_a[ARRAY_SIZE];
Complex h_rev[ARRAY_SIZE];

// Input signal, complex part remains zero.
// Signal is of the form sin(2*M_PI*f*x/N) or cos(2*M_PI*f*x/N)
// N is sample size and is always a power of 2
for(int i = 0; i < ARRAY_SIZE; i++) {
h_a[i].x = cos((6.0*M_PI*i)/ARRAY_SIZE);
h_a[i].y = 0.0;
}

// No of bits required to represent N
int s = (int)ceil(log2(ARRAY_SIZE));

// Device arrays
Complex *d_a, *d_rev;

// Memory Allocation
cudaMalloc((void**) &d_a, ARRAY_BYTES);
cudaMalloc((void**) &d_rev, ARRAY_BYTES);

// Copy host array to device
cudaMemcpy(d_a, h_a, ARRAY_BYTES, cudaMemcpyHostToDevice);

//Start of performance measurement
cudaEventRecord(start);

// First step in FFT, bit-reverse reordering
// Swap an element at index 'i', with the element
// which is the bit-string reversal of 'i'
bit_reverse_reorder<<<(ARRAY_SIZE+THREADS-1)/THREADS, THREADS>>>(d_rev, d_a, s);

cudaDeviceSynchronize();

// FFT driver code
for (int i=1;i<=s;i++){

// m = 2^i
int m = 1 << i;

if (m/2 < MAX_NO_OF_THREADS_PER_BLOCK){

fft_outer<<<((ARRAY_SIZE/m)+THREADS-1)/THREADS,THREADS>>>(d_rev,m);

} else {

for (int j=0;j<ARRAY_SIZE;j+=m){

fft_inner<<<((m/2)+THREADS-1)/THREADS,THREADS>>>(d_rev,j,m);

}
}
}

//End of performance measurement
cudaEventRecord(stop);

//Block CPU execution until the event "stop" is recorded
cudaEventSynchronize(stop);

//Print the time taken in milliseconds
float milliseconds = 0;
cudaEventElapsedTime(&milliseconds, start, stop);
printf("The total time taken is %f milliseconds\n", milliseconds);

// Copy result array from device to host
cudaMemcpy(h_rev,d_rev,ARRAY_BYTES,cudaMemcpyDeviceToHost);

// Writing output to files
fprintf(fptr, "i\t\ta.magn\t\ta.real\t\t\ta.img\n");
for (int i = 0; i < ARRAY_SIZE; i++)
{
fprintf(fptr,"%d\t\t%f\t\t%f\t\t%f\n", i, magnitude(h_rev[i]), h_rev[i].x, h_rev[i].y);
}

// Free allocated device memory
cudaFree(d_a);
cudaFree(d_rev);

return 0;
}
109 changes: 109 additions & 0 deletions 1D-FFT/fft-serialized.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <unistd.h>
#include <time.h>

#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

#define ARRAY_SIZE 131072

typedef struct Complex{
float re,im;
} Complex;

const long long ARRAY_BYTES = ARRAY_SIZE*sizeof(Complex);

int bit_reverse(int n, int s){
int rev=0, count=s;
while(n){
rev = rev << 1;
rev |= n & 1;
n = n >> 1;
count--;
}
rev = rev << count;
return rev;
}

void bit_reverse_reorder(Complex *a, Complex *rev, int s){
for(int i=0;i<ARRAY_SIZE;i++){
rev[bit_reverse(i,s)] = a[i];
}
}

void fft(Complex *a, int s){
for(int i=1;i<=s;i++){

int m = 1 << i;
Complex w_m;
w_m.re = cosf((2*M_PI)/m);
w_m.im = -sinf((2*M_PI)/m);

for (int j=0;j<ARRAY_SIZE;j+=m){

Complex w;
w.re = 1;
w.im = 0;

for (int k=0;k<m/2;k++){

Complex t, u;
float w_x = w.re, w_y = w.im;

t.re = w_x*a[j+k+m/2].re - w_y*a[j+k+m/2].im;
t.im = w_x*a[j+k+m/2].im + w_y*a[j+k+m/2].re;

u.re = a[j+k].re;
u.im = a[j+k].im;

a[j+k].re = u.re + t.re;
a[j+k].im = u.im + t.im;

a[j+k+m/2].re = u.re - t.re;
a[j+k+m/2].im = u.im - t.im;

w.re = w_x*w_m.re - w_y*w_m.im;
w.im = w_x*w_m.im + w_y*w_m.re;
}
}
}
}

float magnitude(Complex a)
{
return sqrt(a.re*a.re + a.im*a.im);
}

void main(){
clock_t start, end;
FILE *fptr;
fptr = fopen("fft-serialized-output.dat", "wr");
if (fptr == NULL) {
printf("Error!");
exit(1);
}
Complex *a, *rev;
int s = (int)ceil(log2(ARRAY_SIZE));
a = (Complex *)malloc(ARRAY_BYTES);
rev = (Complex *)malloc(ARRAY_BYTES);
for (int i=0;i<ARRAY_SIZE;i++){
a[i].re = sinf((12*M_PI*i)/ARRAY_SIZE);
a[i].im = 0;
}
start = clock();
bit_reverse_reorder(a,rev,s);
fft(rev,s);
end = clock();
double time = ((double)(end - start))/1000;
printf("Time in ms: %lf\n", (double)time);
fprintf(fptr, "i\t\trev.magn\t\trev.real\t\trev.img\t\ta.magn\t\ta.real\t\ta.img\n");
for (int i = 0; i < ARRAY_SIZE; i++)
{
fprintf(fptr,"%d\t\t%f\t\t%f\t\t%f\t\t%f\t\t%f\t\t%f\n", i, magnitude(rev[i]), rev[i].re, rev[i].im, magnitude(a[i]), a[i].re, a[i].im);
}

// printf ("%f %f %f %f\n",rev[6].re, rev[6].im, rev[ARRAY_SIZE-6].re, rev[ARRAY_SIZE-6].im);
}
Loading