-
-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy path03_matrix_multiplication_hip.cpp
More file actions
361 lines (298 loc) · 12.4 KB
/
03_matrix_multiplication_hip.cpp
File metadata and controls
361 lines (298 loc) · 12.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
#include <hip/hip_runtime.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
// Naive matrix multiplication kernel: one thread per output element.
// Expects a 2D launch where (blockDim.x * gridDim.x, blockDim.y * gridDim.y)
// covers at least N x N threads; excess threads exit via the guard clause.
// Each thread walks a full row of A and column of B (N global loads each),
// with no data reuse -- kept deliberately simple as the baseline for the
// tiled/optimized variants below.
__global__ void matrixMultiply(float *A, float *B, float *C, int N) {
    // Output coordinates for this thread (x -> column, y -> row).
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    // Guard the grid tail: the grid may overshoot when N is not a
    // multiple of the block dimensions.
    if (row >= N || col >= N) {
        return;
    }
    // Dot product of row `row` of A with column `col` of B.
    float acc = 0.0f;
    for (int k = 0; k < N; ++k) {
        acc += A[row * N + k] * B[k * N + col];
    }
    C[row * N + col] = acc;
}
// Optimized matrix multiplication using shared memory (tiled)
// Each 16x16 thread block cooperatively stages one tile of A and one tile of
// B into shared memory, then every thread accumulates a partial dot product
// from the staged tiles before moving to the next tile pair. Cuts global
// loads per element from 2N (naive kernel) to roughly 2N/TILE_SIZE.
// Precondition: must be launched with 16x16 thread blocks so thread indices
// line up with the tile dimensions.
__global__ void matrixMultiplyTiled(float *A, float *B, float *C, int N) {
// Tile size (matches block size)
const int TILE_SIZE = 16;
// Shared memory for tiles of A and B
__shared__ float tileA[16][16];
__shared__ float tileB[16][16];
// Global output coordinates this thread is responsible for.
int row = hipBlockIdx_y * TILE_SIZE + hipThreadIdx_y;
int col = hipBlockIdx_x * TILE_SIZE + hipThreadIdx_x;
float sum = 0.0f;
// Loop over tiles
// Ceil-division trip count so partially-filled edge tiles are processed;
// the count is uniform across the block, so every thread reaches both
// __syncthreads() calls the same number of times.
for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; t++) {
// Load tile into shared memory
// Out-of-range positions are zero-filled so they add nothing to the sum.
if (row < N && (t * TILE_SIZE + hipThreadIdx_x) < N) {
tileA[hipThreadIdx_y][hipThreadIdx_x] = A[row * N + t * TILE_SIZE + hipThreadIdx_x];
} else {
tileA[hipThreadIdx_y][hipThreadIdx_x] = 0.0f;
}
if (col < N && (t * TILE_SIZE + hipThreadIdx_y) < N) {
tileB[hipThreadIdx_y][hipThreadIdx_x] = B[(t * TILE_SIZE + hipThreadIdx_y) * N + col];
} else {
tileB[hipThreadIdx_y][hipThreadIdx_x] = 0.0f;
}
// Barrier: all loads must complete before any thread reads the tiles.
__syncthreads();
// Compute partial sum using shared memory
for (int k = 0; k < TILE_SIZE; k++) {
sum += tileA[hipThreadIdx_y][k] * tileB[k][hipThreadIdx_x];
}
// Barrier: all reads must complete before the next iteration overwrites
// the shared tiles.
__syncthreads();
}
// Write result
// Guard the grid tail when N is not a multiple of TILE_SIZE.
if (row < N && col < N) {
C[row * N + col] = sum;
}
}
// Platform-specific optimized version
// Exactly one of the two kernels below is compiled, selected by the HIP
// platform macro set by hipcc. Both use the same 16x16 tiling scheme as
// matrixMultiplyTiled and require 16x16 thread blocks; they differ only in
// per-vendor instruction-level tweaks.
#ifdef __HIP_PLATFORM_AMD__
// AMD variant: same tiled algorithm with ternary-style coalesced tile loads
// and an unrolled inner product loop.
__global__ void matrixMultiplyAMDOptimized(float *A, float *B, float *C, int N) {
// AMD-specific optimizations
const int TILE_SIZE = 16;
__shared__ float tileA[16][16];
__shared__ float tileB[16][16];
// Global output coordinates for this thread.
int row = hipBlockIdx_y * TILE_SIZE + hipThreadIdx_y;
int col = hipBlockIdx_x * TILE_SIZE + hipThreadIdx_x;
float sum = 0.0f;
// ROCm 7: Use clang loop optimization hints instead of fixed unroll count
// The compiler will determine optimal unrolling based on target architecture
for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; t++) {
// Coalesced loads
// Out-of-range positions are zero-filled so edge tiles stay correct.
tileA[hipThreadIdx_y][hipThreadIdx_x] =
(row < N && (t * TILE_SIZE + hipThreadIdx_x) < N) ?
A[row * N + t * TILE_SIZE + hipThreadIdx_x] : 0.0f;
tileB[hipThreadIdx_y][hipThreadIdx_x] =
(col < N && (t * TILE_SIZE + hipThreadIdx_y) < N) ?
B[(t * TILE_SIZE + hipThreadIdx_y) * N + col] : 0.0f;
// Barrier: tile loads must finish before any thread reads them.
__syncthreads();
// Unrolled computation for better pipeline utilization
#pragma unroll
for (int k = 0; k < TILE_SIZE; k++) {
sum += tileA[hipThreadIdx_y][k] * tileB[k][hipThreadIdx_x];
}
// Barrier: reads must finish before the next iteration overwrites tiles.
__syncthreads();
}
// Guard the grid tail when N is not a multiple of TILE_SIZE.
if (row < N && col < N) {
C[row * N + col] = sum;
}
}
#elif defined(__HIP_PLATFORM_NVIDIA__)
// NVIDIA variant: routes tile loads through the read-only data cache via
// __ldg() and accumulates with explicit fused multiply-add.
__global__ void matrixMultiplyNVIDIAOptimized(float *A, float *B, float *C, int N) {
// NVIDIA-specific optimizations
const int TILE_SIZE = 16;
__shared__ float tileA[16][16];
__shared__ float tileB[16][16];
int row = hipBlockIdx_y * TILE_SIZE + hipThreadIdx_y;
int col = hipBlockIdx_x * TILE_SIZE + hipThreadIdx_x;
float sum = 0.0f;
for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; t++) {
// Use texture cache hints for better memory performance
tileA[hipThreadIdx_y][hipThreadIdx_x] =
(row < N && (t * TILE_SIZE + hipThreadIdx_x) < N) ?
__ldg(&A[row * N + t * TILE_SIZE + hipThreadIdx_x]) : 0.0f;
tileB[hipThreadIdx_y][hipThreadIdx_x] =
(col < N && (t * TILE_SIZE + hipThreadIdx_y) < N) ?
__ldg(&B[(t * TILE_SIZE + hipThreadIdx_y) * N + col]) : 0.0f;
// Barrier: tile loads must finish before any thread reads them.
__syncthreads();
#pragma unroll
for (int k = 0; k < TILE_SIZE; k++) {
// __fmaf_rn: single rounded fused multiply-add per iteration.
sum = __fmaf_rn(tileA[hipThreadIdx_y][k], tileB[k][hipThreadIdx_x], sum);
}
// Barrier: reads must finish before the next iteration overwrites tiles.
__syncthreads();
}
// Guard the grid tail when N is not a multiple of TILE_SIZE.
if (row < N && col < N) {
C[row * N + col] = sum;
}
}
#endif
// HIP_CHECK(call): evaluate a HIP runtime call; on any status other than
// hipSuccess, print the failing file/line plus the runtime's error string
// and exit. HIP reports errors through return codes rather than exceptions,
// and an unchecked ("sticky") error would otherwise surface at an unrelated
// later call -- so every HIP API call in this file is wrapped.
// The do/while(0) wrapper makes the macro behave as a single statement
// (safe inside unbraced if/else). Comments are kept outside the macro body
// because line splicing happens before comment removal.
#define HIP_CHECK(call) \
do { \
hipError_t error = call; \
if (error != hipSuccess) { \
fprintf(stderr, "HIP error at %s:%d - %s\n", __FILE__, __LINE__, \
hipGetErrorString(error)); \
exit(EXIT_FAILURE); \
} \
} while(0)
// CPU reference implementation of C = A * B for square N x N row-major
// matrices, used to verify the GPU kernels. Straightforward O(N^3) triple
// loop; no blocking or vectorization -- correctness is the only goal here.
void matrixMultiplyCPU(float *A, float *B, float *C, int N) {
    for (int row = 0; row < N; row++) {
        // Hoist the row base pointer: A's row is reused across all columns.
        const float *aRow = A + row * N;
        for (int col = 0; col < N; col++) {
            // Dot product of A's row with B's column.
            float acc = 0.0f;
            for (int k = 0; k < N; k++) {
                acc += aRow[k] * B[k * N + col];
            }
            C[row * N + col] = acc;
        }
    }
}
// Driver: allocates N x N float matrices, runs the naive, tiled, and (when
// available) platform-optimized kernels, verifies a submatrix against a CPU
// reference, and reports timing / GFLOPS / bandwidth estimates.
int main() {
    // Matrix size (N x N)
    const int N = 512; // Start with smaller size for demonstration
    const int size = N * N * sizeof(float);
    printf("HIP Matrix Multiplication: %dx%d matrices\n", N, N);

    // Get device information
    int device;
    hipDeviceProp_t props;
    HIP_CHECK(hipGetDevice(&device));
    HIP_CHECK(hipGetDeviceProperties(&props, device));
    printf("Running on: %s\n", props.name);
    printf("Platform: ");
#ifdef __HIP_PLATFORM_AMD__
    printf("AMD ROCm\n");
#elif defined(__HIP_PLATFORM_NVIDIA__)
    printf("NVIDIA CUDA\n");
#else
    printf("Unknown\n");
#endif

    // Host matrices (h_C_ref holds the CPU reference for verification).
    float *h_A = (float*)malloc(size);
    float *h_B = (float*)malloc(size);
    float *h_C = (float*)malloc(size);
    float *h_C_ref = (float*)malloc(size);
    // FIX: malloc results were previously unchecked.
    if (!h_A || !h_B || !h_C || !h_C_ref) {
        fprintf(stderr, "Host allocation failed\n");
        return EXIT_FAILURE;
    }

    // Initialize matrices with deterministic pseudo-random values in [0, 1].
    printf("Initializing matrices...\n");
    srand(42); // Fixed seed for reproducible results
    for (int i = 0; i < N * N; i++) {
        h_A[i] = rand() / (float)RAND_MAX;
        h_B[i] = rand() / (float)RAND_MAX;
    }

    // Device matrices
    float *d_A, *d_B, *d_C;
    HIP_CHECK(hipMalloc(&d_A, size));
    HIP_CHECK(hipMalloc(&d_B, size));
    HIP_CHECK(hipMalloc(&d_C, size));
    HIP_CHECK(hipMemcpy(d_A, h_A, size, hipMemcpyHostToDevice));
    HIP_CHECK(hipMemcpy(d_B, h_B, size, hipMemcpyHostToDevice));

    // 16x16 blocks match the kernels' TILE_SIZE; ceil-div grid covers N x N.
    dim3 blockSize(16, 16);
    dim3 gridSize((N + blockSize.x - 1) / blockSize.x, (N + blockSize.y - 1) / blockSize.y);

    // Events for GPU-side timing.
    hipEvent_t start, stop;
    HIP_CHECK(hipEventCreate(&start));
    HIP_CHECK(hipEventCreate(&stop));
    printf("\n=== Performance Comparison ===\n");

    // Test naive implementation
    printf("Running naive HIP kernel...\n");
    HIP_CHECK(hipEventRecord(start));
    matrixMultiply<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
    // FIX: kernel launches do not return errors; query explicitly so a bad
    // launch configuration is caught here rather than at a later sync.
    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipEventRecord(stop));
    HIP_CHECK(hipEventSynchronize(stop));
    float naiveTime;
    HIP_CHECK(hipEventElapsedTime(&naiveTime, start, stop));

    // Test tiled implementation
    printf("Running tiled HIP kernel...\n");
    HIP_CHECK(hipEventRecord(start));
    matrixMultiplyTiled<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipEventRecord(stop));
    HIP_CHECK(hipEventSynchronize(stop));
    float tiledTime;
    HIP_CHECK(hipEventElapsedTime(&tiledTime, start, stop));

    // Test platform-specific optimized version (if one was compiled).
    float optimizedTime = 0.0f;
    bool hasOptimized = false;
#ifdef __HIP_PLATFORM_AMD__
    printf("Running AMD-optimized kernel...\n");
    HIP_CHECK(hipEventRecord(start));
    matrixMultiplyAMDOptimized<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipEventRecord(stop));
    HIP_CHECK(hipEventSynchronize(stop));
    HIP_CHECK(hipEventElapsedTime(&optimizedTime, start, stop));
    hasOptimized = true;
#elif defined(__HIP_PLATFORM_NVIDIA__)
    printf("Running NVIDIA-optimized kernel...\n");
    HIP_CHECK(hipEventRecord(start));
    matrixMultiplyNVIDIAOptimized<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);
    HIP_CHECK(hipGetLastError());
    HIP_CHECK(hipEventRecord(stop));
    HIP_CHECK(hipEventSynchronize(stop));
    HIP_CHECK(hipEventElapsedTime(&optimizedTime, start, stop));
    hasOptimized = true;
#endif

    // Copy final result back (d_C holds the output of the last kernel run;
    // all kernels compute the same product, so any of them can be verified).
    HIP_CHECK(hipMemcpy(h_C, d_C, size, hipMemcpyDeviceToHost));

    // CPU verification of the top-left checkSize x checkSize block.
    // FIX: the original called matrixMultiplyCPU(h_A, h_B, h_C_ref, checkSize),
    // which (a) indexed h_A/h_B with row stride checkSize although their real
    // stride is N, reading the wrong elements, and (b) summed only checkSize
    // terms while the GPU sums N terms -- so verification could never match.
    // Compute full-length (k = 0..N-1) dot products for the checked block.
    printf("Running CPU verification...\n");
    int checkSize = (N > 64) ? 64 : N; // Only verify subset for speed
    for (int i = 0; i < checkSize; i++) {
        for (int j = 0; j < checkSize; j++) {
            float sum = 0.0f;
            for (int k = 0; k < N; k++) {
                sum += h_A[i * N + k] * h_B[k * N + j];
            }
            h_C_ref[i * checkSize + j] = sum;
        }
    }

    // Verify result
    bool correct = true;
    for (int i = 0; i < checkSize && correct; i++) {
        for (int j = 0; j < checkSize && correct; j++) {
            float gpu = h_C[i * N + j];
            float ref = h_C_ref[i * checkSize + j];
            // FIX: relative tolerance -- sums of N float products (values
            // around N/4 here) can legitimately differ from the CPU result
            // by more than an absolute 1e-3 due to summation order.
            if (fabs(gpu - ref) > 1e-3f * fmaxf(1.0f, fabsf(ref))) {
                correct = false;
                printf("Mismatch at (%d,%d): GPU=%.6f, CPU=%.6f\n",
                       i, j, gpu, ref);
            }
        }
    }

    // Performance analysis
    printf("\n=== Performance Results ===\n");
    printf("Matrix size: %dx%d\n", N, N);
    printf("Naive time: %.3f ms\n", naiveTime);
    printf("Tiled time: %.3f ms\n", tiledTime);
    if (hasOptimized) {
        printf("Optimized time: %.3f ms\n", optimizedTime);
        printf("Tiled vs Naive speedup: %.2fx\n", naiveTime / tiledTime);
        printf("Optimized vs Tiled speedup: %.2fx\n", tiledTime / optimizedTime);
        printf("Overall speedup: %.2fx\n", naiveTime / optimizedTime);
    } else {
        printf("Tiled speedup: %.2fx\n", naiveTime / tiledTime);
    }

    // Calculate GFLOPS: each of N^2 outputs needs N multiplies + N adds.
    double gflops = (2.0 * N * N * N) / 1e9; // 2N^3 operations
    printf("\nPerformance (GFLOPS):\n");
    printf("  Naive: %.2f GFLOPS\n", gflops / (naiveTime / 1000.0));
    printf("  Tiled: %.2f GFLOPS\n", gflops / (tiledTime / 1000.0));
    if (hasOptimized) {
        printf("  Optimized: %.2f GFLOPS\n", gflops / (optimizedTime / 1000.0));
    }
    printf("Verification: %s\n", correct ? "PASSED" : "FAILED");

    // Memory and compute analysis (3 * size = minimum traffic: read A and B,
    // write C once; ignores cache misses on repeated reads).
    printf("\n=== Analysis ===\n");
    printf("Total operations: %.2f billion\n", gflops);
    printf("Arithmetic intensity: %.2f ops/byte\n",
           (2.0 * N * N * N) / (3.0 * N * N * sizeof(float)));
    double bestTime = hasOptimized ? optimizedTime : tiledTime;
    double bytesTransferred = 3.0 * size; // Read A, B and write C
    printf("Memory bandwidth achieved: %.2f GB/s\n",
           (bytesTransferred / (1024.0 * 1024.0 * 1024.0)) / (bestTime / 1000.0));
    // Theoretical peak bandwidth (DDR: 2 transfers/clock * bus bytes).
    // FIX: guard against memoryClockRate == 0, which some runtimes report;
    // the original printed inf/nan efficiency in that case.
    double theoreticalBW = 2.0 * props.memoryClockRate * (props.memoryBusWidth / 8) / 1.0e6;
    if (theoreticalBW > 0.0) {
        printf("Theoretical peak bandwidth: %.2f GB/s\n", theoreticalBW);
        printf("Bandwidth efficiency: %.1f%%\n",
               ((bytesTransferred / (1024.0 * 1024.0 * 1024.0)) / (bestTime / 1000.0) / theoreticalBW) * 100.0);
    } else {
        printf("Theoretical peak bandwidth: unavailable on this device\n");
    }

    // Platform-specific insights
    printf("\nPlatform-specific optimizations:\n");
#ifdef __HIP_PLATFORM_AMD__
    printf("  - Used wavefront-optimized memory access patterns\n");
    printf("  - Applied loop unrolling for better instruction scheduling\n");
    printf("  - Optimized for AMD GCN architecture\n");
    printf("  - Wavefront size: %d\n", props.warpSize);
#elif defined(__HIP_PLATFORM_NVIDIA__)
    printf("  - Used texture cache with __ldg() for reads\n");
    printf("  - Applied fused multiply-add with __fmaf_rn()\n");
    printf("  - Optimized for NVIDIA warp execution\n");
    printf("  - Warp size: %d\n", props.warpSize);
#else
    printf("  - Generic optimizations applied\n");
#endif

    // Cleanup
    free(h_A); free(h_B); free(h_C); free(h_C_ref);
    HIP_CHECK(hipFree(d_A));
    HIP_CHECK(hipFree(d_B));
    HIP_CHECK(hipFree(d_C));
    HIP_CHECK(hipEventDestroy(start));
    HIP_CHECK(hipEventDestroy(stop));
    printf("\nHIP matrix multiplication completed successfully!\n");
    return 0;
}