| 3 | 1/1 | 返回列表 |
| 查看: 658 | 回復(fù): 2 | ||
木子化十文武金蟲 (小有名氣)
|
[求助]
GPU加速倍數(shù)只有12倍,怎么提高加速倍數(shù)
|
|
本人是做量子Monte Carlo模擬的,最近在用GPU跑程序,發(fā)現(xiàn)相對于CPU來說,只加速了12倍,想了很久不知道怎么解決,求高手指教,只要有幫助,金幣可以增加! (主要應(yīng)該就是調(diào)整DimGrid和DimBlock吧!)

// Implementation of the different kernels in the GPU using CUDA

#include "blaslapack.h"
#include "profile.h"
#include "kernel_gpu.h"
// NOTE(review): the two include names below were lost in the original
// (two empty "#include" lines). <cstdio> is needed for snprintf and
// <cublas_v2.h> for cublasHandle_t/cublasDgemm -- confirm against the
// project's build before committing.
#include <cstdio>
#include <cublas_v2.h>

// Thread block size for all kernels
#define K 64

#ifdef DQMC_PROFILE
// When profiling, synchronize first so the timed interval actually covers
// the (otherwise asynchronous) kernel launches and library calls.
#define GPU_PROFILE_BEGIN() if (profile_enabled) cudaDeviceSynchronize(); \
        PROFILE_BEGIN()
#define GPU_PROFILE_END(i, n) if (profile_enabled) cudaDeviceSynchronize(); \
        PROFILE_END(i, n)
#else
#define GPU_PROFILE_BEGIN()
#define GPU_PROFILE_END(i, n)
#endif

// Build a human-readable message for a failed CUDA/CUBLAS call; retrieved
// later through what().
cuda_exception::cuda_exception(const char *file, int line, int code)
{
    snprintf(message, sizeof(message), "CUDA error #%i at %s:%i\n",
             code, file, line);
}

const char* cuda_exception::what() const throw()
{
    return message;
}

// Static handle for the CUBLAS library (created in gpu_init, destroyed in
// gpu_shutdown).
static cublasHandle_t handle = NULL;

// D = diag(A): copy the main diagonal of the n x n column-major matrix A
// into the vector D (stride n+1 walks the diagonal).
void gpu_diag(int n, double *A, double *D)
{
    GPU_PROFILE_BEGIN();
    CUDACHECK(cublasDcopy(handle, n, A, n + 1, D, 1));
    GPU_PROFILE_END(profile_dcopy, 16.0 * n);
}

// One block per column: scale column b of A in place by D[b] and reduce the
// squared 2-norm of the scaled column into c[b].  Launch: <<< n, K >>>.
// NOTE(review): the tail of the reduction (tid < 32) relies on implicit
// warp-synchronous execution via 'volatile' with no __syncwarp(); that is
// only safe on pre-Volta architectures -- confirm the target SM.
__global__ void kernel_normcol(int n, double *A, double *D, double *c)
{
    int i, tid = threadIdx.x, j = blockIdx.x * n; // working column
    __shared__ volatile double shared[K];
    double t, w = 0.0, d = D[blockIdx.x];
    double *p = A + j + tid;
    for (i = tid; i < n; i += K) {
        t = *p * d;        // t = A[j + i] * d
        *p = t;            // A[j + i] = t
        w += t * t;
        p += K;
    }
    shared[tid] = w;
    __syncthreads();
    // Tree reduction of the K partial sums (dead branches folded at
    // compile time since K is a macro constant).
    if (K >= 512) { if (tid < 256) { shared[tid] = w = w + shared[tid + 256]; } __syncthreads(); }
    if (K >= 256) { if (tid < 128) { shared[tid] = w = w + shared[tid + 128]; } __syncthreads(); }
    if (K >= 128) { if (tid < 64) { shared[tid] = w = w + shared[tid + 64]; } __syncthreads(); }
    if (tid < 32) {
        if (K >= 64) shared[tid] = w = w + shared[tid + 32];
        if (K >= 32) shared[tid] = w = w + shared[tid + 16];
        if (K >= 16) shared[tid] = w = w + shared[tid + 8];
        if (K >= 8) shared[tid] = w = w + shared[tid + 4];
        if (K >= 4) shared[tid] = w = w + shared[tid + 2];
        if (K >= 2) shared[tid] = w = w + shared[tid + 1];
    }
    if (tid == 0) c[blockIdx.x] = w;
}

void gpu_normcol(int n, double *A, double *D, double *c)
{
    GPU_PROFILE_BEGIN();
    kernel_normcol <<< n , K >>> (n, A, D, c);
    CUDACHECK(cudaGetLastError());
    GPU_PROFILE_END(profile_normcol, 2.0 * n * n);
}

// Column gather: Q(:, b) = A(:, ipiv[b]); one block per destination column.
__global__ void kernel_permute(int n, int *ipiv, double *A, double *Q)
{
    int i, j = blockIdx.x * n;        // working column
    int p = ipiv[blockIdx.x] * n;     // source column
    double *pQ = Q + j + threadIdx.x;
    double *pA = A + p + threadIdx.x;
    for (i = threadIdx.x; i < n; i += K) {
        *pQ = *pA;                    // Q[j + i] = A[p + i]
        pQ += K;
        pA += K;
    }
}

void gpu_permute(int n, int *ipiv, double *A, double *Q)
{
    GPU_PROFILE_BEGIN();
    kernel_permute <<< n , K >>> (n, ipiv, A, Q);
    CUDACHECK(cudaGetLastError());
    GPU_PROFILE_END(profile_permute, 16.0 * n * n);
}

// T = D^-1 * R * P: scale the rows of the upper-triangular Q by 1/D[i] and
// scatter column j into column ipiv[j] of T; entries below the diagonal of
// the source column are written as zero.  One block per column.
__global__ void kernel_scalerowperm(int n, double *D, double *Q, int *ipiv,
                                    double *T)
{
    int i, j = blockIdx.x;            // working column
    int p = ipiv[j] * n;              // destination column
    double *pT = T + p + threadIdx.x;
    double *pQ = Q + j * n + threadIdx.x;
    double *pD = D + threadIdx.x;
    for (i = threadIdx.x; i <= j; i += K) {
        *pT = *pQ / *pD;              // T[p + i] = Q[j * n + i] / D[i]
        pT += K;
        pQ += K;
        pD += K;
    }
    for (; i < n; i += K) {
        *pT = 0.0;                    // T[p + i] = 0
        pT += K;
    }
}

void gpu_scalerowperm(int n, double *D, double *Q, int *ipiv, double *T)
{
    GPU_PROFILE_BEGIN();
    kernel_scalerowperm <<< n , K >>> (n, D, Q, ipiv, T);
    CUDACHECK(cudaGetLastError());
    GPU_PROFILE_END(profile_scalerowperm, 0.5 * n * n);
}

// M = diag(h) * B: scale row i of B by h[i]; one block per column.
__global__ void kernel_scalerow(int n, double *h, double *B, double *M)
{
    int i, j = blockIdx.x * n;        // working column
    double *pM = M + j + threadIdx.x;
    double *ph = h + threadIdx.x;
    double *pB = B + j + threadIdx.x;
    for (i = threadIdx.x; i < n; i += K) {
        *pM = *ph * *pB;              // M[j + i] = h[i] * B[j + i]
        pM += K;
        ph += K;
        pB += K;
    }
}

void gpu_scalerow(int n, double *h, double *B, double *M)
{
    GPU_PROFILE_BEGIN();
    kernel_scalerow <<< n , K >>> (n, h, B, M);
    CUDACHECK(cudaGetLastError());
    GPU_PROFILE_END(profile_scalerow, 1.0 * n * n);
}

// G = diag(h) * G * diag(h)^-1, in place; one block per column (column j is
// divided by h[j], row i multiplied by h[i]).
__global__ void kernel_scalerowcol(int n, double *h, double *G)
{
    int i, j = blockIdx.x * n;        // working column
    double t, f = 1.0 / h[blockIdx.x];
    double *pG = G + j + threadIdx.x;
    double *ph = h + threadIdx.x;
    for (i = threadIdx.x; i < n; i += K) {
        t = *ph * *pG;                // G[j + i] = h[i] * G[j + i] / h[j]
        *pG = t * f;
        pG += K;
        ph += K;
    }
}

void gpu_scalerowcol(int n, double *h, double *G)
{
    GPU_PROFILE_BEGIN();
    kernel_scalerowcol <<< n , K >>> (n, h, G);
    CUDACHECK(cudaGetLastError());
    GPU_PROFILE_END(profile_scalerowcol, 2.0 * n * n);
}

// C = alpha*op(A)*op(B) + beta*C via CUBLAS.  'trans' is a two-character
// string ("NN", "NT", ...) selecting the op applied to A and B respectively.
void gpu_dgemm(const char *trans, int m, int n, int k, double alpha,
               double *a, int lda, double *b, int ldb, double beta,
               double *c, int ldc)
{
    GPU_PROFILE_BEGIN();
    cublasOperation_t transa = trans[0] == 'N' ? CUBLAS_OP_N : CUBLAS_OP_T;
    cublasOperation_t transb = trans[1] == 'N' ? CUBLAS_OP_N : CUBLAS_OP_T;
    CUDACHECK(cublasDgemm(handle, transa, transb, m, n, k, &alpha, a, lda,
                          b, ldb, &beta, c, ldc));
    GPU_PROFILE_END(profile_dgemm, 2.0 * k * m * n);
}

// QR factorization on the GPU (MAGMA); throws lapack_exception on a nonzero
// LAPACK-style info code.
void gpu_dgeqrf(int m, int n, double *dA, int ldda, double *tau, double *dT)
{
    int info;
    GPU_PROFILE_BEGIN();
    magma_dgeqrf3_gpu(m, n, dA, ldda, tau, dT, &info);
    if (info) throw(lapack_exception("GPU DGEQRF", info));
    GPU_PROFILE_END(profile_dgeqrf, 2.0*m*n*n - 2.0*n*n*n/3.0 + m*n + n*n
                    + 14.0*n/3.0);
}

// Generate the explicit Q factor from a prior gpu_dgeqrf (MAGMA).
void gpu_dorgqr(int m, int n, int k, double *da, int ldda, double *tau,
                double *dT, int nb)
{
    int info;
    GPU_PROFILE_BEGIN();
    magma_dorgqr_gpu(m, n, k, da, ldda, tau, dT, nb, &info);
    if (info) throw(lapack_exception("GPU DORGQR", info));
    GPU_PROFILE_END(profile_dorgqr, 4.0*m*n*k - 2.0*(m+n)*k*k + 4.0*k*k*k/3.0
                    + 3.0*n*k - m*k - k*k - 4.0*k/3.0);
}

// LU factorization with partial pivoting (MAGMA).
void gpu_dgetrf(int m, int n, double *dA, int ldda, int *ipiv)
{
    int info;
    GPU_PROFILE_BEGIN();
    magma_dgetrf_gpu(m, n, dA, ldda, ipiv, &info);
    if (info) throw(lapack_exception("GPU DGETRF", info));
    GPU_PROFILE_END(profile_dgetrf, m*n*n - n*n*n/3.0 - n*n/2.0 + 5.0*n/6.0);
}

// Solve op(A)*X = B using a prior gpu_dgetrf factorization (MAGMA).
void gpu_dgetrs(const char *trans, int n, int nrhs, double *dA, int ldda,
                int *ipiv, double *dB, int lddb)
{
    int info;
    GPU_PROFILE_BEGIN();
    magma_dgetrs_gpu(trans[0], n, nrhs, dA, ldda, ipiv, dB, lddb, &info);
    if (info) throw(lapack_exception("GPU DGETRS", info));
    GPU_PROFILE_END(profile_dgetrs, nrhs * (2.0*n*n - n));
}

// Host -> device vector transfer (n elements of 'size' bytes each).
void gpu_setvector(int n, int size, void *src, void *dst)
{
    GPU_PROFILE_BEGIN();
    CUDACHECK(cublasSetVector(n, size, src, 1, dst, 1));
    GPU_PROFILE_END(profile_transfer, 2.0 * n * size);
}

// Device -> host vector transfer.
void gpu_getvector(int n, int size, void *src, void *dst)
{
    GPU_PROFILE_BEGIN();
    CUDACHECK(cublasGetVector(n, size, src, 1, dst, 1));
    GPU_PROFILE_END(profile_transfer, 2.0 * n * size);
}

// Host -> device m x n matrix transfer (leading dimension m on both sides).
void gpu_setmatrix(int m, int n, int size, void *src, void *dst)
{
    GPU_PROFILE_BEGIN();
    CUDACHECK(cublasSetMatrix(m, n, size, src, m, dst, m));
    GPU_PROFILE_END(profile_transfer, 2.0 * m * n * size);
}

// Device -> host m x n matrix transfer (leading dimension m on both sides).
void gpu_getmatrix(int m, int n, int size, void *src, void *dst)
{
    GPU_PROFILE_BEGIN();
    CUDACHECK(cublasGetMatrix(m, n, size, src, m, dst, m));
    GPU_PROFILE_END(profile_transfer, 2.0 * m * n * size);
}

// Device-to-device copy of 'size' bytes.
void gpu_copy(void *dst, void *src, int size)
{
    GPU_PROFILE_BEGIN();
    // BUGFIX: the original lacked the ';' after CUDACHECK(...), a syntax
    // error in DQMC_PROFILE builds where GPU_PROFILE_END expands to code.
    CUDACHECK(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice));
    GPU_PROFILE_END(profile_dcopy, 2.0 * size);
}

// Rank sort: block idx counts the elements that precede val[idx] in the
// ordering (ties broken by index) and stores idx at that position in 'ind'
// (counting val[i] > myValue, i.e. apparently descending order -- verify
// against callers).  The K threads of the block share the counting loop.
// NOTE(review): same pre-Volta warp-synchronous reduction caveat as
// kernel_normcol.
__global__ void kernel_sort(int length, double *val, int *ind)
{
    const unsigned int idx = blockIdx.x;
    int i, tid = threadIdx.x;
    __shared__ volatile int shared[K];
    int w = 0;
    double myValue = val[idx];
    for (i = tid; i < length; i += K) {
        // BUGFIX: the original compared 'myValue' against the POINTER 'val'
        // (ill-formed: double vs double*); the element val[i] was intended.
        if (myValue < val[i] || (val[i] == myValue && i > idx)) {
            w++;
        }
    }
    shared[tid] = w;
    __syncthreads();
    if (K >= 512) { if (tid < 256) { shared[tid] = w = w + shared[tid + 256]; } __syncthreads(); }
    if (K >= 256) { if (tid < 128) { shared[tid] = w = w + shared[tid + 128]; } __syncthreads(); }
    if (K >= 128) { if (tid < 64) { shared[tid] = w = w + shared[tid + 64]; } __syncthreads(); }
    if (tid < 32) {
        if (K >= 64) shared[tid] = w = w + shared[tid + 32];
        if (K >= 32) shared[tid] = w = w + shared[tid + 16];
        if (K >= 16) shared[tid] = w = w + shared[tid + 8];
        if (K >= 8) shared[tid] = w = w + shared[tid + 4];
        if (K >= 4) shared[tid] = w = w + shared[tid + 2];
        if (K >= 2) shared[tid] = w = w + shared[tid + 1];
    }
    if (tid == 0) ind[w] = idx;
}

void gpu_sort(int n, double *Db, int *ipiv)
{
    // GPU version: one block per element.  (The original also computed an
    // unused ceil(n/K) block count; removed as dead code.)
    GPU_PROFILE_BEGIN();
    kernel_sort <<< n , K >>> (n, Db, ipiv);
    CUDACHECK(cudaGetLastError());
    GPU_PROFILE_END(profile_sort, 8.0 * n * n);
}

// Create the CUBLAS handle (idempotent) and prefer L1 cache for every
// hand-written kernel, since they use only a tiny shared array.
void gpu_init()
{
    if (!handle) CUDACHECK(cublasCreate(&handle));
    CUDACHECK(cudaFuncSetCacheConfig(kernel_normcol, cudaFuncCachePreferL1));
    CUDACHECK(cudaFuncSetCacheConfig(kernel_permute, cudaFuncCachePreferL1));
    CUDACHECK(cudaFuncSetCacheConfig(kernel_scalerowperm, cudaFuncCachePreferL1));
    CUDACHECK(cudaFuncSetCacheConfig(kernel_scalerow, cudaFuncCachePreferL1));
    CUDACHECK(cudaFuncSetCacheConfig(kernel_scalerowcol, cudaFuncCachePreferL1));
    CUDACHECK(cudaFuncSetCacheConfig(kernel_sort, cudaFuncCachePreferL1));
}

// Destroy the CUBLAS handle; safe to call when gpu_init was never run.
void gpu_shutdown()
{
    if (handle) CUDACHECK(cublasDestroy(handle));
    handle = NULL;
}
|

捐助貴賓 (正式寫手)

捐助貴賓 (正式寫手)

| 3 | 1/1 | 返回列表 |
| 最具人氣熱帖推薦 [查看全部] | 作者 | 回/看 | 最后發(fā)表 | |
|---|---|---|---|---|
|
[考研] 279 分 求調(diào)劑 +4 | 睡個好覺_16 2026-03-24 | 4/200 |
|
|---|---|---|---|---|
|
[考研] 一志愿太原理工安全工程300分,求調(diào)劑 +4 | 0857求調(diào)劑. 2026-03-24 | 4/200 |
|
|
[考研] 材料學(xué)碩333求調(diào)劑 +8 | 北道巷 2026-03-24 | 8/400 |
|
|
[碩博家園] 招收生物學(xué)/細(xì)胞生物學(xué)調(diào)劑 +3 | IceGuo 2026-03-26 | 4/200 |
|
|
[考研] 349求調(diào)劑 +4 | 李木子啊哈哈 2026-03-25 | 4/200 |
|
|
[考研] 327求調(diào)劑 +7 | prayer13 2026-03-23 | 7/350 |
|
|
[考研] 286求調(diào)劑 +13 | Faune 2026-03-21 | 13/650 |
|
|
[考研] 材料277求調(diào)劑 +5 | min3 2026-03-24 | 5/250 |
|
|
[考研] 化學(xué)調(diào)劑一志愿上海交通大學(xué)336分-本科上海211 +4 | 小魚愛有機(jī) 2026-03-25 | 4/200 |
|
|
[考研] 07化學(xué)303求調(diào)劑 +5 | 睿08 2026-03-25 | 5/250 |
|
|
[考研] 一志愿中南大學(xué)化學(xué)學(xué)碩0703總分337求調(diào)劑 +7 | niko- 2026-03-22 | 7/350 |
|
|
[考研] 求b區(qū)院校調(diào)劑 +4 | 周56 2026-03-24 | 5/250 |
|
|
[考研] 0854人工智能方向招收調(diào)劑 +4 | 章小魚567 2026-03-24 | 4/200 |
|
|
[考研] 086003食品工程求調(diào)劑 +6 | 淼淼111 2026-03-24 | 6/300 |
|
|
[考研] 340求調(diào)劑 +5 | 話梅糖111 2026-03-24 | 5/250 |
|
|
[考研] 311求調(diào)劑 +3 | 冬十三 2026-03-24 | 3/150 |
|
|
[考研] 0703化學(xué)調(diào)劑,求導(dǎo)師收 +7 | 天天好運(yùn)來上岸 2026-03-24 | 7/350 |
|
|
[考研] 一志愿吉大化學(xué)322求調(diào)劑 +4 | 17501029541 2026-03-23 | 6/300 |
|
|
[考研]
|
酥酥魚.. 2026-03-21 | 4/200 |
|
|
[考研] 一志愿南大,0703化學(xué),分?jǐn)?shù)336,求調(diào)劑 +3 | 收到VS 2026-03-21 | 3/150 |
|