-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathmatrix_operations.cpp
More file actions
99 lines (80 loc) · 2.83 KB
/
matrix_operations.cpp
File metadata and controls
99 lines (80 loc) · 2.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#include "matrix_operations.h"
#include <iostream>
#include <random>
#include <chrono>
#include <stdexcept>
#include <immintrin.h> // AVX2 intrinsics
/// Builds a matrix with `r` rows and `c` columns, every entry set to 0.0.
///
/// @param r Number of rows.
/// @param c Number of columns.
Matrix::Matrix(size_t r, size_t c) : rows(r), cols(c) {
    // A single zero-filled row serves as the prototype copied into each row.
    const std::vector<double> zero_row(cols, 0.0);
    data.resize(rows, zero_row);
}
/// Overwrites every entry with a pseudo-random double drawn from a uniform
/// distribution constructed with bounds 0.0 and 10.0.
///
/// A fresh std::mt19937 engine is seeded from std::random_device on each call.
/// Entries are filled row by row, left to right within a row.
void Matrix::randomize() {
    std::random_device seed_source;
    std::mt19937 engine(seed_source());
    std::uniform_real_distribution<double> uniform(0.0, 10.0);

    for (size_t r = 0; r < rows; ++r) {
        auto& row = data[r];
        for (size_t c = 0; c < cols; ++c) {
            row[c] = uniform(engine);
        }
    }
}
/// Computes the matrix product (*this) * other.
///
/// @param other Right-hand operand; its row count must equal this matrix's
///              column count.
/// @return A new matrix of size rows x other.cols holding the product.
/// @throws std::runtime_error if cols != other.rows.
///
/// Each output entry is a dot product evaluated with 256-bit AVX
/// double-precision intrinsics, four products per step, followed by a scalar
/// loop for the leftover (cols % 4) terms. `other` is first transposed into a
/// contiguous scratch buffer so that each of its columns can be read with one
/// unaligned vector load per step, rather than picking one element out of
/// four separate row vectors inside the innermost loop. The transpose costs
/// O(cols * other.cols) extra time and memory.
Matrix Matrix::multiply(const Matrix& other) const {
    if (cols != other.rows) {
        throw std::runtime_error("Invalid matrix dimensions for multiplication");
    }

    Matrix result(rows, other.cols);

    // Column j of `other` occupies other_t[j * cols, j * cols + cols).
    std::vector<double> other_t(other.cols * cols);
    for (size_t k = 0; k < cols; k++) {
        const auto& other_row = other.data[k];
        for (size_t j = 0; j < other.cols; j++) {
            other_t[j * cols + k] = other_row[j];
        }
    }

    for (size_t i = 0; i < rows; i++) {
        const double* lhs_row = data[i].data();

        for (size_t j = 0; j < other.cols; j++) {
            const double* rhs_col = other_t.data() + j * cols;

            // Four independent partial sums, one per vector lane.
            __m256d sum_vec = _mm256_setzero_pd();
            size_t k = 0;

            // Process 4 elements at a time.
            for (; k + 3 < cols; k += 4) {
                const __m256d a_vec = _mm256_loadu_pd(lhs_row + k);
                const __m256d b_vec = _mm256_loadu_pd(rhs_col + k);
                sum_vec = _mm256_add_pd(sum_vec, _mm256_mul_pd(a_vec, b_vec));
            }

            // Horizontal add: fold the upper 128 bits onto the lower 128 bits,
            // then add the two remaining lanes.
            const __m128d sum_high = _mm256_extractf128_pd(sum_vec, 1);
            const __m128d sum_low = _mm256_castpd256_pd128(sum_vec);
            const __m128d sum_128 = _mm_add_pd(sum_low, sum_high);
            double sum_arr[2];
            _mm_storeu_pd(sum_arr, sum_128);
            double sum = sum_arr[0] + sum_arr[1];

            // Handle the remaining (cols % 4) elements with scalar arithmetic.
            for (; k < cols; k++) {
                sum += lhs_row[k] * rhs_col[k];
            }

            result.data[i][j] = sum;
        }
    }

    return result;
}
/// Adds up every entry of the matrix.
///
/// Entries are accumulated into a single running total in row-major order.
///
/// @return The sum of all rows * cols entries; 0.0 when there are none.
double Matrix::sum() const {
    double accumulator = 0.0;
    for (size_t r = 0; r < rows; ++r) {
        const auto& row = data[r];
        for (size_t c = 0; c < cols; ++c) {
            accumulator += row[c];
        }
    }
    return accumulator;
}
/// Runs one timed 200x200 matrix multiplication and reports it on stdout.
///
/// Two square matrices are filled with random values and multiplied once.
/// The elapsed time (truncated to whole milliseconds) is printed together
/// with the matrix size and the sum of the product's entries.
void benchmark_matrix_ops() {
    std::cout << "\n=== Matrix Multiplication Benchmark ===" << std::endl;

    const size_t size = 200;
    Matrix a(size, size);
    Matrix b(size, size);
    a.randomize();
    b.randomize();

    // steady_clock is monotonic, so the measured interval cannot be distorted
    // by adjustments to the system clock while the multiplication runs.
    const auto start = std::chrono::steady_clock::now();
    Matrix c = a.multiply(b);
    const auto end = std::chrono::steady_clock::now();
    const auto duration =
        std::chrono::duration_cast<std::chrono::milliseconds>(end - start);

    std::cout << "Matrix size: " << size << "x" << size << std::endl;
    std::cout << "Time: " << duration.count() << " ms" << std::endl;
    std::cout << "Result sum: " << c.sum() << std::endl;
}