diff --git a/1D-FFT/fft-opt1.cu b/1D-FFT/fft-opt1.cu
new file mode 100644
index 0000000..333f50e
--- /dev/null
+++ b/1D-FFT/fft-opt1.cu
@@ -0,0 +1,191 @@
+// One of the possible optimizations to the implementation in fft.cu
+// 
+// The algorithm is as follows:
+//
+// for (s = 0 to log2(N)) ........ loop1
+//      m = 2^i;
+//      statements
+//      for (j=0;j<N;j+=m) ....... loop2
+//          for (k=0 to m/2) ..... loop3
+//              statements
+//
+// The second and third loop are complementary to each other in the sense that 
+// while loop2 runs for n/m values of j, loop3 runs for m/2 values of k. With 
+// larger values of m, loop3 does more work, while for smaller values of m, loop2
+// does more work, so in this optimization, we plan to separate out the two cases
+// and achieve parallelization of both outer and inner loops
+//
+// The point of separation needs to be experimentally arrived at, i.e, we need to 
+// test out different cases and choose the best of the lot. As a first commit in
+// the optimization, we have adopted a naive approach and check the value of m/2.
+// If the value of m/2 is less than the maximum number of threads that can be 
+// spawned by a block, we parallelize loop2, and loop3 otherwise.
+//
+//
+
+#include <stdio.h>
+#include <cmath>
+#include <cuda.h>
+
+typedef float2 Complex;
+
+#define THREADS 32
+#define MAX_NO_OF_THREADS_PER_BLOCK 1024
+
+const long long ARRAY_SIZE = 65536; 
+const long long ARRAY_BYTES = ARRAY_SIZE * sizeof(Complex);
+
+// Bit reversal re-ordering, first step of FFT
+__global__ void bit_reverse_reorder(Complex *d_rev, Complex *d_a, int s) {
+	  int id = blockIdx.x * blockDim.x + threadIdx.x;
+    int rev = __brev(id) >> (32-s);
+
+    if(id < ARRAY_SIZE)
+        d_rev[rev] = d_a[id];
+}
+
+// Work of the innermost loop, common to both parallelization
+__device__ void inplace_fft(Complex *a, int j, int k, int m){
+    
+    if (j+k+m/2 < ARRAY_SIZE){
+        
+        Complex w, t, u;
+
+        // w^k (w is root of unity)
+        w.x = __cosf((2*M_PI*k)/m);
+        w.y = -__sinf((2*M_PI*k)/m);
+
+        // u = a[j+k]
+        u.x = a[j+k].x;
+        u.y = a[j+k].y;
+
+        // t = w*a[j+k+m/2];
+        t.x = w.x*a[j+k+m/2].x - w.y*a[j+k+m/2].y;
+        t.y = w.x*a[j+k+m/2].y + w.y*a[j+k+m/2].x;
+
+        // a[j+k] = u+t;
+        a[j+k].x = u.x + t.x;
+        a[j+k].y = u.y + t.y;
+
+        // a[j+k+m/2] = u-t;
+        a[j+k+m/2].x = u.x - t.x;
+        a[j+k+m/2].y = u.y - t.y;
+
+    }
+}
+
+// Parallelization of loop2
+__global__ void fft_outer(Complex *a, int m){
+    int j = (blockIdx.x * blockDim.x + threadIdx.x)*m;
+    if (j < ARRAY_SIZE){
+        for (int k=0;k<m/2;k++){
+            inplace_fft(a,j,k,m);
+        }
+    }    
+}
+
+// Parallelization of loop3
+__global__ void fft_inner(Complex *a, int j, int m){
+    int k = (blockIdx.x * blockDim.x + threadIdx.x);
+    if (k < m/2)
+        inplace_fft(a,j,k,m);
+}
+
+float magnitude(float2 a)
+{
+    return sqrt(a.x*a.x + a.y*a.y);
+}
+
+int main() {
+
+    //Measuring performance
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    // Creating files to write output to
+    FILE *fptr;
+    fptr = fopen("fft-opt1-output.dat", "wr");
+
+    // Host arrays for input and output
+    Complex h_a[ARRAY_SIZE];
+    Complex h_rev[ARRAY_SIZE];
+
+    // Input signal, complex part remains zero.
+    // Signal is of the form sin(2*M_PI*f*x/N) or cos(2*M_PI*f*x/N)
+    // N is sample size and is always a power of 2 
+    for(int i = 0; i < ARRAY_SIZE; i++) {
+    	h_a[i].x = cos((6.0*M_PI*i)/ARRAY_SIZE);
+        h_a[i].y = 0.0;
+    }
+        
+    // No of bits required to represent N
+    int s = (int)ceil(log2(ARRAY_SIZE));
+
+    // Device arrays
+    Complex *d_a, *d_rev;
+
+    // Memory Allocation
+    cudaMalloc((void**) &d_a, ARRAY_BYTES);
+    cudaMalloc((void**) &d_rev, ARRAY_BYTES);
+
+    // Copy host array to device
+    cudaMemcpy(d_a, h_a, ARRAY_BYTES, cudaMemcpyHostToDevice);
+
+    //Start of performance measurement
+    cudaEventRecord(start);
+
+    // First step in FFT, bit-reverse reordering
+    // Swap an element at index 'i', with the element 
+    // which is the bit-string reversal of 'i' 
+    bit_reverse_reorder<<<(ARRAY_SIZE+THREADS-1)/THREADS, THREADS>>>(d_rev, d_a, s);
+    
+    cudaDeviceSynchronize();
+
+    // FFT driver code
+    for (int i=1;i<=s;i++){
+
+        // m = 2^i
+        int m = 1 << i;
+
+        if (m/2 < MAX_NO_OF_THREADS_PER_BLOCK){
+
+            fft_outer<<<((ARRAY_SIZE/m)+THREADS-1)/THREADS,THREADS>>>(d_rev,m);    
+
+        } else {
+            
+            for (int j=0;j<ARRAY_SIZE;j+=m){
+
+                fft_inner<<<((m/2)+THREADS-1)/THREADS,THREADS>>>(d_rev,j,m);
+
+            }
+        }
+    }
+
+    //End of performance measurement
+    cudaEventRecord(stop);
+
+    //Block CPU execution until the event "stop" is recorded
+    cudaEventSynchronize(stop);
+
+    //Print the time taken in milliseconds
+    float milliseconds = 0;
+    cudaEventElapsedTime(&milliseconds, start, stop);
+    printf("The total time taken is %f milliseconds\n", milliseconds);
+
+    // Copy result array from device to host
+    cudaMemcpy(h_rev,d_rev,ARRAY_BYTES,cudaMemcpyDeviceToHost);
+
+    // Writing output to files
+    fprintf(fptr, "i\t\ta.magn\t\ta.real\t\t\ta.img\n");
+    for (int i = 0; i < ARRAY_SIZE; i++)
+    {
+        fprintf(fptr,"%d\t\t%f\t\t%f\t\t%f\n", i, magnitude(h_rev[i]), h_rev[i].x, h_rev[i].y);
+    }
+
+    // Free allocated device memory
+    cudaFree(d_a);
+    cudaFree(d_rev);
+
+    return 0;
+}
\ No newline at end of file
diff --git a/1D-FFT/fft-serialized.c b/1D-FFT/fft-serialized.c
new file mode 100644
index 0000000..e7dd538
--- /dev/null
+++ b/1D-FFT/fft-serialized.c
@@ -0,0 +1,109 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <unistd.h> 
+#include <time.h>
+
+#ifndef M_PI
+    #define M_PI 3.14159265358979323846
+#endif
+
+#define ARRAY_SIZE 131072
+
+typedef struct Complex{
+    float re,im;
+} Complex;
+
+const long long ARRAY_BYTES = ARRAY_SIZE*sizeof(Complex);
+
+int bit_reverse(int n, int s){
+    int rev=0, count=s;
+    while(n){
+        rev = rev << 1;
+        rev |= n & 1;
+        n = n >> 1;
+        count--;
+    }
+    rev = rev << count;
+    return rev;
+}
+
+void bit_reverse_reorder(Complex *a, Complex *rev, int s){
+    for(int i=0;i<ARRAY_SIZE;i++){        
+        rev[bit_reverse(i,s)] = a[i];
+    }
+}
+
+void fft(Complex *a, int s){
+    for(int i=1;i<=s;i++){
+        
+        int m = 1 << i;
+        Complex w_m;
+        w_m.re = cosf((2*M_PI)/m);
+        w_m.im = -sinf((2*M_PI)/m); 
+
+        for (int j=0;j<ARRAY_SIZE;j+=m){
+            
+            Complex w;
+            w.re = 1;
+            w.im = 0;
+
+            for (int k=0;k<m/2;k++){
+
+                Complex t, u;
+                float w_x = w.re, w_y = w.im;
+
+                t.re = w_x*a[j+k+m/2].re - w_y*a[j+k+m/2].im;
+                t.im = w_x*a[j+k+m/2].im + w_y*a[j+k+m/2].re;
+
+                u.re = a[j+k].re;
+                u.im = a[j+k].im;
+
+                a[j+k].re = u.re + t.re;
+                a[j+k].im = u.im + t.im;
+
+                a[j+k+m/2].re = u.re - t.re;
+                a[j+k+m/2].im = u.im - t.im;
+
+                w.re = w_x*w_m.re - w_y*w_m.im;
+                w.im = w_x*w_m.im + w_y*w_m.re;                
+            }
+        }
+    }
+}
+
+float magnitude(Complex a)
+{
+    return sqrt(a.re*a.re + a.im*a.im);
+}
+
+void main(){
+    clock_t start, end;
+    FILE *fptr;
+    fptr = fopen("fft-serialized-output.dat", "wr");
+    if (fptr == NULL) {
+        printf("Error!");
+        exit(1);
+    }
+    Complex *a, *rev;
+    int s = (int)ceil(log2(ARRAY_SIZE));
+    a = (Complex *)malloc(ARRAY_BYTES);
+    rev = (Complex *)malloc(ARRAY_BYTES);
+    for (int i=0;i<ARRAY_SIZE;i++){
+        a[i].re = sinf((12*M_PI*i)/ARRAY_SIZE);
+        a[i].im = 0;
+    }
+    start = clock();
+    bit_reverse_reorder(a,rev,s);
+    fft(rev,s);
+    end = clock();
+    double time = ((double)(end - start))/1000;
+    printf("Time in ms: %lf\n", (double)time);
+    fprintf(fptr, "i\t\trev.magn\t\trev.real\t\trev.img\t\ta.magn\t\ta.real\t\ta.img\n");
+    for (int i = 0; i < ARRAY_SIZE; i++)
+    {
+        fprintf(fptr,"%d\t\t%f\t\t%f\t\t%f\t\t%f\t\t%f\t\t%f\n", i, magnitude(rev[i]), rev[i].re, rev[i].im, magnitude(a[i]), a[i].re, a[i].im);
+    }
+    
+    // printf ("%f %f %f %f\n",rev[6].re, rev[6].im, rev[ARRAY_SIZE-6].re, rev[ARRAY_SIZE-6].im);
+}
\ No newline at end of file
diff --git a/1D-FFT/fft.cu b/1D-FFT/fft.cu
new file mode 100644
index 0000000..268e09a
--- /dev/null
+++ b/1D-FFT/fft.cu
@@ -0,0 +1,139 @@
+// Include header files
+#include <bits/stdc++.h>
+#include <cuda.h>
+#include <cmath>
+
+#define ll long long int
+#define THREADS 32
+
+typedef float2 Complex;
+
+const long long ARRAY_SIZE = 1024;
+const long long ARRAY_BYTES = ARRAY_SIZE * sizeof(Complex);
+
+// Parallelized reordering (Doesn't this count as pre-processing?)
+__global__ void bit_reverse_reorder (Complex *d_rev, Complex *d_a, int s){
+    int id = (blockIdx.x * blockDim.x) + threadIdx.x;
+    int rev = __brev(id) >> (32-s);
+    if (id < ARRAY_SIZE)
+        d_rev[rev] = d_a[id];
+}
+
+// FFT driver kernel code
+__global__ void fft(Complex *a, int j, int m){
+    int k = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if(k < m/2 && j+k+m/2 < ARRAY_SIZE){
+        
+        Complex w, t, u;
+
+        // w^k (w is root of unity)
+        w.x = __cosf((2*M_PI*k)/m);
+        w.y = -__sinf((2*M_PI*k)/m);
+
+        // u = a[j+k]
+        u.x = a[j+k].x;
+        u.y = a[j+k].y;
+
+        // t = w*a[j+k+m/2];
+        t.x = w.x*a[j+k+m/2].x - w.y*a[j+k+m/2].y;
+        t.y = w.x*a[j+k+m/2].y + w.y*a[j+k+m/2].x;
+
+        // a[j+k] = u+t;
+        a[j+k].x = u.x + t.x;
+        a[j+k].y = u.y + t.y;
+
+        // a[j+k+m/2] = u-t;
+        a[j+k+m/2].x = u.x - t.x;
+        a[j+k+m/2].y = u.y - t.y;
+
+    }
+}
+
+float magnitude(float2 a)
+{
+    return sqrt(a.x*a.x + a.y*a.y);
+}
+
+int main(int argc, char *argv[]) {
+
+    //Measuring performance
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    // Creating files to write output to
+    FILE *fptr;
+    fptr = fopen("fft-output.dat", "wr");
+
+    //Creating Complex arrays for data 
+    Complex h_a[ARRAY_SIZE], h_rev[ARRAY_SIZE]; 
+    Complex *d_a, *d_rev;
+
+    // Input signal (of the form sin((2*M_PI*f*x)/N)) where N is the sample size
+    // imaginary part of the signal by default is 0
+    for(int i = 0; i < ARRAY_SIZE; i++) {
+    	h_a[i].x = sin((10*M_PI*i)/ARRAY_SIZE);
+        h_a[i].y = 0.0;
+    }
+        
+    // No. of bits in the sample size, used for bit reversal reordering
+    int s = (int)ceil(log2(ARRAY_SIZE));
+
+    //Allocate memory for the device arrays
+    cudaMalloc((void**) &d_a, ARRAY_BYTES);
+    cudaMalloc((void**) &d_rev, ARRAY_BYTES);
+
+    //Copy all elements of sample array from host to device
+    cudaMemcpy(d_a, h_a, ARRAY_BYTES, cudaMemcpyHostToDevice);
+
+    //Start of performance measurement
+    cudaEventRecord(start);
+
+    //Reorder the sample as first step of FFT
+    bit_reverse_reorder<<<(ARRAY_SIZE+THREADS-1)/THREADS, THREADS>>>(d_rev, d_a, s);
+
+    //Synchronise devices before jumping to fft
+    cudaDeviceSynchronize();
+
+    // Naive fft parallelization (TODO: improve upon the efficiency)
+    for (int i=0;i<=s;i++){
+
+        int m = 1 << i;
+        
+        for(int j=0;j<ARRAY_SIZE;j+=m){
+
+            // Performing in-place fft
+            fft<<<((m/2)+THREADS-1)/THREADS,THREADS>>>(d_rev,j,m);    
+        
+        }    
+    }
+
+    //End of performance measurement
+    cudaEventRecord(stop);
+
+    //Block CPU execution until the event "stop" is recorded
+    cudaEventSynchronize(stop);
+
+    //Print the time taken in milliseconds.
+    float milliseconds = 0;
+    cudaEventElapsedTime(&milliseconds, start, stop);
+    printf("The total time taken is %f milliseconds\n", milliseconds);
+
+    // Copy result array to host
+    cudaMemcpy(h_rev, d_rev, ARRAY_BYTES, cudaMemcpyDeviceToHost);
+
+    // Writing output to files
+    fprintf(fptr, "i\t\ta.magn\t\ta.real\t\t\ta.img\n");
+    for (int i = 0; i < ARRAY_SIZE; i++)
+    {
+        fprintf(fptr,"%d\t\t%f\t\t%f\t\t%f\n", i, magnitude(h_rev[i]), h_rev[i].x, h_rev[i].y);
+    }
+
+    // Free device memory
+    cudaFree(d_a);
+    cudaFree(d_rev);
+    
+    // TODO
+    // Use h_rev to plot magnitude (sqrt(h_rev[i].x^2 + h_rev[i].y^2)) vs frequency (i)
+}
\ No newline at end of file
diff --git a/1D-FFT/inv_fft.cu b/1D-FFT/inv_fft.cu
new file mode 100644
index 0000000..c8f4214
--- /dev/null
+++ b/1D-FFT/inv_fft.cu
@@ -0,0 +1,242 @@
+#include <stdio.h>
+#include <cmath>
+#include <cuda.h>
+
+typedef float2 Complex;
+
+#define THREADS 32
+#define MAX_NO_OF_THREADS_PER_BLOCK 1024
+
+long long ARRAY_SIZE; 
+long long ARRAY_BYTES;
+
+__global__ void bit_reverse_reorder(Complex *d_rev, Complex *d_a, int s, long long ARRAY_SIZE) {
+	int id = blockIdx.x * blockDim.x + threadIdx.x;
+    int rev = __brev(id) >> (32-s);
+
+    if(id < ARRAY_SIZE)
+        d_rev[rev] = d_a[id];
+}
+
+__global__ void swap_real_and_imaginary(Complex *d_rev, long long ARRAY_SIZE) {
+    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    if (id < ARRAY_SIZE) {
+        float temp = d_rev[id].x;
+        d_rev[id].x = d_rev[id].y;
+        d_rev[id].y = temp;
+    }
+}
+
+__global__ void clip(Complex *d_rev, long long cut_off_frequency, long long ARRAY_SIZE) {
+    int id = blockIdx.x * blockDim.x + threadIdx.x;
+    if (id >= cut_off_frequency && id <= (ARRAY_SIZE - cut_off_frequency)) {
+        d_rev[id].x = 0.0f;
+        d_rev[id].y = 0.0f;
+    }
+}
+
+__device__ void inplace_fft(Complex *a, int j, int k, int m, long long ARRAY_SIZE){
+    
+    if (j+k+m/2 < ARRAY_SIZE){
+        
+        Complex w, t, u;
+
+        // w^k (w is root of unity)
+        w.x = __cosf((2*M_PI*k)/m);
+        w.y = -__sinf((2*M_PI*k)/m);
+
+        // u = a[j+k]
+        u.x = a[j+k].x;
+        u.y = a[j+k].y;
+
+        // t = w*a[j+k+m/2];
+        t.x = w.x*a[j+k+m/2].x - w.y*a[j+k+m/2].y;
+        t.y = w.x*a[j+k+m/2].y + w.y*a[j+k+m/2].x;
+
+        // a[j+k] = u+t;
+        a[j+k].x = u.x + t.x;
+        a[j+k].y = u.y + t.y;
+
+        // a[j+k+m/2] = u-t;
+        a[j+k+m/2].x = u.x - t.x;
+        a[j+k+m/2].y = u.y - t.y;
+
+    }
+}
+
+__global__ void fft_outer(Complex *a, int m, long long ARRAY_SIZE){
+    int j = (blockIdx.x * blockDim.x + threadIdx.x)*m;
+    if (j < ARRAY_SIZE){
+        for (int k=0;k<m/2;k++){
+            inplace_fft(a,j,k,m,ARRAY_SIZE);
+        }
+    }    
+}
+
+__global__ void fft_inner(Complex *a, int j, int m, long long ARRAY_SIZE){
+    int k = (blockIdx.x * blockDim.x + threadIdx.x);
+    if (k < m/2)
+        inplace_fft(a,j,k,m,ARRAY_SIZE);
+}
+
+float magnitude(float2 a)
+{
+    return sqrt(a.x*a.x + a.y*a.y);
+}
+
+int to_int(char str[])
+{
+    int cut_off_frequency = 0;
+    int i = 0;
+    while(str[i] != '\0')
+    {
+        cut_off_frequency += str[i] - 48;
+        cut_off_frequency *= 10;
+
+        i++;
+    }
+
+    return cut_off_frequency/10;
+}
+
+int main(int argc, char *argv[]) 
+{
+    int cut_off_frequency;
+
+    if(argc >= 2)
+        cut_off_frequency = to_int(argv[1]);
+    
+    else
+        cut_off_frequency = 1000;
+
+    // printf("%d\n", cut_off_frequency);
+    
+
+    //Measuring performance
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    //Creating file to read input from
+    FILE *input;
+    input = fopen("../utils/input.dat", "r");
+
+    int fs;
+
+    fscanf(input, "%d", &fs);
+
+    fscanf(input, "%Ld", &ARRAY_SIZE);
+
+    ARRAY_BYTES = ARRAY_SIZE * sizeof(Complex);
+
+    // Creating files to write output to
+    FILE *fptr;
+    FILE *output;
+
+    fptr = fopen("../utils/fft-opt1-output.dat", "wr");
+    output = fopen("../utils/convert_to_wav.dat", "wr");
+    
+    Complex *h_a = (Complex *)malloc(ARRAY_BYTES);
+    Complex *h_rev = (Complex *)malloc(ARRAY_BYTES);
+
+    for(int i = 0; i < ARRAY_SIZE; i++) 
+    {
+   	    fscanf(input, "%f", &h_a[i].x);
+        // h_a[i].x = sin((6.0*M_PI*i)/ARRAY_SIZE);
+        // printf("%f\n", h_a[i].x);
+        h_a[i].y = 0.0;
+    }
+    	
+    int s = (int)ceil(log2(ARRAY_SIZE));
+
+    Complex *d_a, *d_rev, *d_rev1;
+
+    cudaMalloc((void**) &d_a, ARRAY_BYTES);
+    cudaMalloc((void**) &d_rev, ARRAY_BYTES);
+    cudaMalloc((void**) &d_rev1, ARRAY_BYTES);
+
+    cudaMemcpy(d_a, h_a, ARRAY_BYTES, cudaMemcpyHostToDevice);
+
+    //Start of performance measurement
+    cudaEventRecord(start);
+
+    bit_reverse_reorder<<<(ARRAY_SIZE+THREADS-1)/THREADS, THREADS>>>(d_rev, d_a, s, ARRAY_SIZE);
+    
+    cudaDeviceSynchronize();
+
+    for (int i=1;i<=s;i++){
+        int m = 1 << i;
+        if (m/2 < MAX_NO_OF_THREADS_PER_BLOCK){
+            fft_outer<<<((ARRAY_SIZE/m)+THREADS-1)/THREADS,THREADS>>>(d_rev,m,ARRAY_SIZE);
+        } else {
+            for (int j=0;j<ARRAY_SIZE;j+=m){
+                fft_inner<<<((m/2)+THREADS-1)/THREADS,THREADS>>>(d_rev,j,m,ARRAY_SIZE);
+            }
+        }
+    }
+
+    cudaDeviceSynchronize();
+
+    // Clipping
+    clip<<<(ARRAY_SIZE+THREADS-1)/THREADS, THREADS>>>(d_rev, cut_off_frequency, ARRAY_SIZE);
+
+    // Beginning of inverse FFT
+
+    swap_real_and_imaginary<<<(ARRAY_SIZE+THREADS-1)/THREADS, THREADS>>>(d_rev, ARRAY_SIZE);
+
+    cudaDeviceSynchronize();
+
+    bit_reverse_reorder<<<(ARRAY_SIZE+THREADS-1)/THREADS, THREADS>>>(d_rev1, d_rev, s, ARRAY_SIZE);
+
+    cudaDeviceSynchronize();
+    
+    for (int i=1;i<=s;i++)
+    {
+        int m = 1 << i;
+        if (m < sqrt(ARRAY_SIZE / 4))
+        {
+            fft_outer<<<((ARRAY_SIZE/m)+THREADS-1)/THREADS,THREADS>>>(d_rev1,m,ARRAY_SIZE);
+        } 
+        else 
+        {
+            for (int j=0;j<ARRAY_SIZE;j+=m)
+            {
+                fft_inner<<<((m/2)+THREADS-1)/THREADS,THREADS>>>(d_rev1,j,m,ARRAY_SIZE);
+            }
+        }
+    }
+
+    swap_real_and_imaginary<<<(ARRAY_SIZE+THREADS-1)/THREADS, THREADS>>>(d_rev1, ARRAY_SIZE);
+
+    // End of performance measurement
+    cudaEventRecord(stop);
+
+    // Block CPU execution until the event "stop" is recorded
+    cudaEventSynchronize(stop);
+
+    //Print the time taken in milliseconds
+    float milliseconds = 0;
+    cudaEventElapsedTime(&milliseconds, start, stop);
+    printf("The total time taken is %f milliseconds\n", milliseconds);
+
+    cudaMemcpy(h_rev,d_rev1,ARRAY_BYTES,cudaMemcpyDeviceToHost);
+
+    cudaFree(d_a);
+    cudaFree(d_rev);
+    cudaFree(d_rev1);
+    
+    // for (int i=0;i<ARRAY_SIZE;i++){
+    //     printf ("%f %f\n", h_rev[i].x/ARRAY_SIZE, h_rev[i].y/ARRAY_SIZE);
+    // }
+
+    // Writing output to files
+    fprintf(output, "%d\n", fs);
+    fprintf(fptr, "i\t\ta.magn\t\ta.real\t\t\ta.img\n");
+    for (int i = 0; i < ARRAY_SIZE; i++)
+    {
+        fprintf(output, "%Ld\n", (long long)h_rev[i].x/ARRAY_SIZE);
+        fprintf(fptr,"%d\t\t%f\t\t%f\t\t%f\n", i, magnitude(h_rev[i])/ARRAY_SIZE, h_rev[i].x/ARRAY_SIZE, h_rev[i].y/ARRAY_SIZE);
+    }
+
+    return 0;
+}
\ No newline at end of file
diff --git a/1D-FFT/postprocessing.py b/1D-FFT/postprocessing.py
new file mode 100644
index 0000000..cf774da
--- /dev/null
+++ b/1D-FFT/postprocessing.py
@@ -0,0 +1,16 @@
+import numpy
+from scipy.io import wavfile
+
+file1 = open("../utils/convert_to_wav.dat", "r+")
+
+# Raw data has all the data from the file as a list
+raw_data = file1.readlines()
+# The first element of the file is the sampling rate
+fs = int(raw_data[0][0:-1])
+# The rest of the data is amplitudes
+data = []
+for i in range (1, len(raw_data)):
+    data.append(int(raw_data[i][0:-1]))
+data = numpy.array(data, numpy.int16)
+
+wavfile.write('../utils/proccessed_output.wav', fs, data)
diff --git a/1D-FFT/preprocessing.py b/1D-FFT/preprocessing.py
new file mode 100644
index 0000000..0952a9a
--- /dev/null
+++ b/1D-FFT/preprocessing.py
@@ -0,0 +1,14 @@
+import sys
+import numpy
+from scipy.io import wavfile
+
+argument = sys.argv
+print sys.argv[1]
+fs, data = wavfile.read(str(sys.argv[1]))
+
+file1 = open("../utils/input.dat", "w+")
+file1.write(str(fs) + '\n')
+file1.write(str(1 << int(numpy.log2(data.size))) + '\n')
+
+for x in range(0, 1 << int(numpy.log2(data.size))):
+    file1.write(str(data[x]) + '\n')
\ No newline at end of file
diff --git a/1D-FFT/run.sh b/1D-FFT/run.sh
new file mode 100755
index 0000000..6bcb5f3
--- /dev/null
+++ b/1D-FFT/run.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# echo "Argument give is $1"
+
+python preprocessing.py $1
+
+nvcc inv_fft.cu
+./a.out $2
+
+python postprocessing.py
\ No newline at end of file
diff --git a/2D-FFT/fft.cu b/2D-FFT/fft.cu
new file mode 100644
index 0000000..49d28ef
--- /dev/null
+++ b/2D-FFT/fft.cu
@@ -0,0 +1,228 @@
+#include <stdio.h>
+#include <cmath>
+#include <cuda.h>
+
+typedef float2 Complex;
+
+#define THREADS 32
+#define MAX_NO_OF_THREADS_PER_BLOCK 1024
+
+const int TILE_DIM = 32;
+const int BLOCK_ROWS = 8;
+
+// TODO: Make sure the implementations works for non square matrices
+const long long ARRAY_SIZE = 1024; 
+const long long ARRAY_BYTES = ARRAY_SIZE * sizeof(Complex);
+
+// Bit reversal re-ordering, first step of FFT
+__global__ void bit_reverse_reorder(Complex *d_rev, Complex *d_a, int s) 
+{
+    int row_id = blockIdx.y;
+    int col_id = threadIdx.x;
+    int row_size = ARRAY_SIZE;
+    int rev = __brev(col_id) >> (32-s);
+    
+    // d_rev[row_id][rev] = d_a[row_id][col_id]
+    d_rev[row_id * row_size + rev] = d_a[row_id * row_size + col_id];
+}
+
+// Optimized kernel for calculating FFT of all rows simultaneously
+
+__global__ void fft_2d(Complex *A, int s)
+{
+    int row_id = blockIdx.y;
+    int col_id = threadIdx.x;
+    int row_size = ARRAY_SIZE;
+
+    // Shared memory allocated indepedent to each block
+    __shared__ Complex Work_Array[ARRAY_SIZE];
+    
+    // Work_Array[col_id] = A[row_id][col_id];
+    Work_Array[col_id].x = A[(row_id * row_size) + col_id].x;
+    Work_Array[col_id].y = A[(row_id * row_size) + col_id].y;
+
+    // Work_Array[col_id + row_size/2] = A[row_id][col_id + row_size/2];
+    Work_Array[col_id + row_size/2].x = A[(row_id * row_size) + (col_id + row_size/2)].x;
+    Work_Array[col_id + row_size/2].y = A[(row_id * row_size) + (col_id + row_size/2)].y;
+
+    __syncthreads();
+
+    for(int i=1; i<=s; i++)
+    {
+        int m = 1 << i;
+
+        int j = col_id * m;
+
+        for(int k=0; k < m/2; k++)
+        {
+            if(j + k + m/2 < row_size)
+            {
+                Complex w, t, u;
+
+                // w^k (w is root of unity)
+                w.x = __cosf((2*M_PI*k)/m);
+                w.y = -__sinf((2*M_PI*k)/m);
+
+                // u = a[j+k]
+                u.x = Work_Array[j+k].x;
+                u.y = Work_Array[j+k].y;
+
+                // t = w*a[j+k+m/2];
+                t.x = w.x*Work_Array[j+k+m/2].x - w.y*Work_Array[j+k+m/2].y;
+                t.y = w.x*Work_Array[j+k+m/2].y + w.y*Work_Array[j+k+m/2].x;
+
+                // a[j+k] = u+t;
+                Work_Array[j+k].x = u.x + t.x;
+                Work_Array[j+k].y = u.y + t.y;
+
+                // a[j+k+m/2] = u-t;
+                Work_Array[j+k+m/2].x = u.x - t.x;
+                Work_Array[j+k+m/2].y = u.y - t.y;
+
+            }
+        }
+
+        __syncthreads();
+    }
+
+    // Copying data back from shared memory to global memory
+
+    // A[row_id][col_id] = Work_Array[col_id];
+    A[(row_id * row_size) + col_id].x = Work_Array[col_id].x;
+    A[(row_id * row_size) + col_id].y = Work_Array[col_id].y;
+
+    // A[row_id][col_id + row_size/2] =  Work_Array[col_id + row_size/2];
+    A[(row_id * row_size) + (col_id + row_size/2)].x =  Work_Array[col_id + row_size/2].x;
+    A[(row_id * row_size) + (col_id + row_size/2)].y =  Work_Array[col_id + row_size/2].y;
+
+}
+
+__global__ void MatrixTranspose(Complex *d_out, Complex *d_in)
+{
+    __shared__ Complex tile[TILE_DIM][TILE_DIM+1];
+    
+    int x = blockIdx.x * TILE_DIM + threadIdx.x;
+    int y = blockIdx.y * TILE_DIM + threadIdx.y;
+    int width = gridDim.x * TILE_DIM;
+
+    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
+        tile[threadIdx.y+j][threadIdx.x] = d_in[(y+j)*width + x];
+
+    __syncthreads();
+
+    x = blockIdx.y * TILE_DIM + threadIdx.x;  // transpose block offset
+    y = blockIdx.x * TILE_DIM + threadIdx.y;
+
+    for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS)
+        d_out[(y+j)*width + x] = tile[threadIdx.x][threadIdx.y + j];
+}
+
+float magnitude(float2 a)
+{
+    return sqrt(a.x*a.x + a.y*a.y);
+}
+
+int main() 
+{
+
+    //Measuring performance
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    // Creating files to write output to
+    FILE *fptr;
+    fptr = fopen("fft-2d-output.dat", "wr");
+
+    // Host arrays for input and output
+    Complex h_a[ARRAY_SIZE][ARRAY_SIZE];
+    Complex h_rev[ARRAY_SIZE][ARRAY_SIZE];
+
+    // Input signal, complex part remains zero.
+    // Signal is of the form sin(2*M_PI*f*x/N) or cos(2*M_PI*f*x/N)
+    // N is sample size and is always a power of 2 
+    for(int i = 0; i < ARRAY_SIZE; i++) 
+    {
+        for(int j = 0; j < ARRAY_SIZE; j++)
+        {
+            h_a[i][j].x = cos((6.0*M_PI*j)/ARRAY_SIZE);
+            h_a[i][j].y = 0.0;
+        }
+    }
+        
+    // No of bits required to represent N
+    int s = (int)ceil(log2(ARRAY_SIZE));
+
+    // Device arrays
+    Complex *d_a, *d_rev, *d_rev1;
+
+    // Memory Allocation
+    cudaMalloc((void**) &d_a, ARRAY_BYTES * ARRAY_SIZE);
+    cudaMalloc((void**) &d_rev, ARRAY_BYTES * ARRAY_SIZE);
+    cudaMalloc((void**) &d_rev1, ARRAY_BYTES * ARRAY_SIZE);
+
+    // Copy host array to device
+    cudaMemcpy(d_a, *h_a, ARRAY_BYTES * ARRAY_SIZE, cudaMemcpyHostToDevice);
+
+    //Start of performance measurement
+    cudaEventRecord(start);
+
+    // First step in FFT, bit-reverse reordering
+    // Swap an element at index 'i', with the element 
+    // which is the bit-string reversal of 'i' 
+        
+    bit_reverse_reorder<<<dim3(1, ARRAY_SIZE, 1), dim3(ARRAY_SIZE, 1, 1)>>>(d_rev, d_a, s);
+    
+    cudaDeviceSynchronize();
+
+    // FFT driver code for row wise FFT
+    // Assigning every row of the 2d fft to a block of threads and calling N/2 threads for each block
+    fft_2d <<<dim3(1, ARRAY_SIZE, 1), dim3(ARRAY_SIZE/2, 1, 1)>>> (d_rev, s);
+
+    // TODO : Code for matrix transpose - DONE
+
+    cudaMemset(d_rev1, 0, ARRAY_BYTES*ARRAY_SIZE);
+
+    // Taking transpose of d_rev and storing in d_rev1
+    MatrixTranspose<<<dim3(ARRAY_SIZE/TILE_DIM, ARRAY_SIZE/TILE_DIM, 1), dim3(TILE_DIM, BLOCK_ROWS, 1)>>>(d_rev1, d_rev);
+
+    // Column wise bit-reverse reordering of d_rev1, and storing in d_rev
+    bit_reverse_reorder<<<dim3(1, ARRAY_SIZE, 1), dim3(ARRAY_SIZE, 1, 1)>>>(d_rev, d_rev1, s);
+    
+    cudaDeviceSynchronize();
+
+    // FFT driver code for column wise FFT
+    
+    fft_2d <<<dim3(1, ARRAY_SIZE, 1), dim3(ARRAY_SIZE/2, 1, 1)>>> (d_rev, s);
+
+    //End of performance measurement
+    cudaEventRecord(stop);
+
+    //Block CPU execution until the event "stop" is recorded
+    cudaEventSynchronize(stop);
+
+    //Print the time taken in milliseconds
+    float milliseconds = 0;
+    cudaEventElapsedTime(&milliseconds, start, stop);
+    printf("The total time taken is %f milliseconds\n", milliseconds);
+
+    // Copy result array from device to host
+    cudaMemcpy(*h_rev, d_rev, ARRAY_BYTES * ARRAY_SIZE, cudaMemcpyDeviceToHost);
+
+    // Writing output to files
+    fprintf(fptr, "[i][j]\t\ta.magn\t\ta.real\t\t\ta.img\n");
+    for (int i = 0; i < ARRAY_SIZE; i++)
+    {
+        for(int j = 0; j < ARRAY_SIZE; j++)
+        {
+            fprintf(fptr,"[%d][%d]\t\t%f\t\t%f\t\t%f\n", i, j, magnitude(h_rev[i][j]), h_rev[i][j].x, h_rev[i][j].y);
+        }
+    }
+
+    // Free allocated device memory
+    cudaFree(d_a);
+    cudaFree(d_rev);
+    cudaFree(d_rev1);
+
+    return 0;
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 03ccb35..8e0233b 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,5 @@
 # Parallelizing-FFT
+
+    This project is an parallelized implementation of Fast Fourier 
+    Transform (specifically Cooley Tukey algorithm) using CUDA.
+
diff --git a/utils/1kplus250.wav b/utils/1kplus250.wav
new file mode 100644
index 0000000..6814b83
Binary files /dev/null and b/utils/1kplus250.wav differ