| 3 | 1/1 | 返回列表 |
| 查看: 658 | 回復(fù): 2 | ||
木子化十文武金蟲 (小有名氣)
|
[求助]
GPU加速倍數(shù)只有12倍,怎么提高加速倍數(shù)
|
|
本人是做量子Monte Carlo模擬的,最近在用GPU跑程序,發(fā)現(xiàn)相對于CPU來說,只加速了12倍,想了很久不知道怎么解決,求高手指教,只要有幫助,金幣可以增加! (主要應(yīng)該就是調(diào)整DimGrid和DimBlock吧!)

// Implementation of the different kernels in the GPU using CUDA

#include "blaslapack.h"
#include "profile.h"
#include "kernel_gpu.h"
// NOTE(review): the two include names below were lost in the original
// (two empty "#include" lines). <cstdio> is needed for snprintf and
// <cublas_v2.h> for cublasHandle_t/cublasDgemm -- confirm against the
// project's build before committing.
#include <cstdio>
#include <cublas_v2.h>

// Thread block size for all kernels
#define K 64

#ifdef DQMC_PROFILE
// When profiling, synchronize first so the timed interval actually covers
// the (otherwise asynchronous) kernel launches and library calls.
#define GPU_PROFILE_BEGIN() if (profile_enabled) cudaDeviceSynchronize(); \
        PROFILE_BEGIN()
#define GPU_PROFILE_END(i, n) if (profile_enabled) cudaDeviceSynchronize(); \
        PROFILE_END(i, n)
#else
#define GPU_PROFILE_BEGIN()
#define GPU_PROFILE_END(i, n)
#endif

// Build a human-readable message for a failed CUDA/CUBLAS call; retrieved
// later through what().
cuda_exception::cuda_exception(const char *file, int line, int code)
{
    snprintf(message, sizeof(message), "CUDA error #%i at %s:%i\n",
             code, file, line);
}

const char* cuda_exception::what() const throw()
{
    return message;
}

// Static handle for the CUBLAS library (created in gpu_init, destroyed in
// gpu_shutdown).
static cublasHandle_t handle = NULL;

// D = diag(A): copy the main diagonal of the n x n column-major matrix A
// into the vector D (stride n+1 walks the diagonal).
void gpu_diag(int n, double *A, double *D)
{
    GPU_PROFILE_BEGIN();
    CUDACHECK(cublasDcopy(handle, n, A, n + 1, D, 1));
    GPU_PROFILE_END(profile_dcopy, 16.0 * n);
}

// One block per column: scale column b of A in place by D[b] and reduce the
// squared 2-norm of the scaled column into c[b].  Launch: <<< n, K >>>.
// NOTE(review): the tail of the reduction (tid < 32) relies on implicit
// warp-synchronous execution via 'volatile' with no __syncwarp(); that is
// only safe on pre-Volta architectures -- confirm the target SM.
__global__ void kernel_normcol(int n, double *A, double *D, double *c)
{
    int i, tid = threadIdx.x, j = blockIdx.x * n; // working column
    __shared__ volatile double shared[K];
    double t, w = 0.0, d = D[blockIdx.x];
    double *p = A + j + tid;
    for (i = tid; i < n; i += K) {
        t = *p * d;        // t = A[j + i] * d
        *p = t;            // A[j + i] = t
        w += t * t;
        p += K;
    }
    shared[tid] = w;
    __syncthreads();
    // Tree reduction of the K partial sums (dead branches folded at
    // compile time since K is a macro constant).
    if (K >= 512) { if (tid < 256) { shared[tid] = w = w + shared[tid + 256]; } __syncthreads(); }
    if (K >= 256) { if (tid < 128) { shared[tid] = w = w + shared[tid + 128]; } __syncthreads(); }
    if (K >= 128) { if (tid < 64) { shared[tid] = w = w + shared[tid + 64]; } __syncthreads(); }
    if (tid < 32) {
        if (K >= 64) shared[tid] = w = w + shared[tid + 32];
        if (K >= 32) shared[tid] = w = w + shared[tid + 16];
        if (K >= 16) shared[tid] = w = w + shared[tid + 8];
        if (K >= 8) shared[tid] = w = w + shared[tid + 4];
        if (K >= 4) shared[tid] = w = w + shared[tid + 2];
        if (K >= 2) shared[tid] = w = w + shared[tid + 1];
    }
    if (tid == 0) c[blockIdx.x] = w;
}

void gpu_normcol(int n, double *A, double *D, double *c)
{
    GPU_PROFILE_BEGIN();
    kernel_normcol <<< n , K >>> (n, A, D, c);
    CUDACHECK(cudaGetLastError());
    GPU_PROFILE_END(profile_normcol, 2.0 * n * n);
}

// Column gather: Q(:, b) = A(:, ipiv[b]); one block per destination column.
__global__ void kernel_permute(int n, int *ipiv, double *A, double *Q)
{
    int i, j = blockIdx.x * n;        // working column
    int p = ipiv[blockIdx.x] * n;     // source column
    double *pQ = Q + j + threadIdx.x;
    double *pA = A + p + threadIdx.x;
    for (i = threadIdx.x; i < n; i += K) {
        *pQ = *pA;                    // Q[j + i] = A[p + i]
        pQ += K;
        pA += K;
    }
}

void gpu_permute(int n, int *ipiv, double *A, double *Q)
{
    GPU_PROFILE_BEGIN();
    kernel_permute <<< n , K >>> (n, ipiv, A, Q);
    CUDACHECK(cudaGetLastError());
    GPU_PROFILE_END(profile_permute, 16.0 * n * n);
}

// T = D^-1 * R * P: scale the rows of the upper-triangular Q by 1/D[i] and
// scatter column j into column ipiv[j] of T; entries below the diagonal of
// the source column are written as zero.  One block per column.
__global__ void kernel_scalerowperm(int n, double *D, double *Q, int *ipiv,
                                    double *T)
{
    int i, j = blockIdx.x;            // working column
    int p = ipiv[j] * n;              // destination column
    double *pT = T + p + threadIdx.x;
    double *pQ = Q + j * n + threadIdx.x;
    double *pD = D + threadIdx.x;
    for (i = threadIdx.x; i <= j; i += K) {
        *pT = *pQ / *pD;              // T[p + i] = Q[j * n + i] / D[i]
        pT += K;
        pQ += K;
        pD += K;
    }
    for (; i < n; i += K) {
        *pT = 0.0;                    // T[p + i] = 0
        pT += K;
    }
}

void gpu_scalerowperm(int n, double *D, double *Q, int *ipiv, double *T)
{
    GPU_PROFILE_BEGIN();
    kernel_scalerowperm <<< n , K >>> (n, D, Q, ipiv, T);
    CUDACHECK(cudaGetLastError());
    GPU_PROFILE_END(profile_scalerowperm, 0.5 * n * n);
}

// M = diag(h) * B: scale row i of B by h[i]; one block per column.
__global__ void kernel_scalerow(int n, double *h, double *B, double *M)
{
    int i, j = blockIdx.x * n;        // working column
    double *pM = M + j + threadIdx.x;
    double *ph = h + threadIdx.x;
    double *pB = B + j + threadIdx.x;
    for (i = threadIdx.x; i < n; i += K) {
        *pM = *ph * *pB;              // M[j + i] = h[i] * B[j + i]
        pM += K;
        ph += K;
        pB += K;
    }
}

void gpu_scalerow(int n, double *h, double *B, double *M)
{
    GPU_PROFILE_BEGIN();
    kernel_scalerow <<< n , K >>> (n, h, B, M);
    CUDACHECK(cudaGetLastError());
    GPU_PROFILE_END(profile_scalerow, 1.0 * n * n);
}

// G = diag(h) * G * diag(h)^-1, in place; one block per column (column j is
// divided by h[j], row i multiplied by h[i]).
__global__ void kernel_scalerowcol(int n, double *h, double *G)
{
    int i, j = blockIdx.x * n;        // working column
    double t, f = 1.0 / h[blockIdx.x];
    double *pG = G + j + threadIdx.x;
    double *ph = h + threadIdx.x;
    for (i = threadIdx.x; i < n; i += K) {
        t = *ph * *pG;                // G[j + i] = h[i] * G[j + i] / h[j]
        *pG = t * f;
        pG += K;
        ph += K;
    }
}

void gpu_scalerowcol(int n, double *h, double *G)
{
    GPU_PROFILE_BEGIN();
    kernel_scalerowcol <<< n , K >>> (n, h, G);
    CUDACHECK(cudaGetLastError());
    GPU_PROFILE_END(profile_scalerowcol, 2.0 * n * n);
}

// C = alpha*op(A)*op(B) + beta*C via CUBLAS.  'trans' is a two-character
// string ("NN", "NT", ...) selecting the op applied to A and B respectively.
void gpu_dgemm(const char *trans, int m, int n, int k, double alpha,
               double *a, int lda, double *b, int ldb, double beta,
               double *c, int ldc)
{
    GPU_PROFILE_BEGIN();
    cublasOperation_t transa = trans[0] == 'N' ? CUBLAS_OP_N : CUBLAS_OP_T;
    cublasOperation_t transb = trans[1] == 'N' ? CUBLAS_OP_N : CUBLAS_OP_T;
    CUDACHECK(cublasDgemm(handle, transa, transb, m, n, k, &alpha, a, lda,
                          b, ldb, &beta, c, ldc));
    GPU_PROFILE_END(profile_dgemm, 2.0 * k * m * n);
}

// QR factorization on the GPU (MAGMA); throws lapack_exception on a nonzero
// LAPACK-style info code.
void gpu_dgeqrf(int m, int n, double *dA, int ldda, double *tau, double *dT)
{
    int info;
    GPU_PROFILE_BEGIN();
    magma_dgeqrf3_gpu(m, n, dA, ldda, tau, dT, &info);
    if (info) throw(lapack_exception("GPU DGEQRF", info));
    GPU_PROFILE_END(profile_dgeqrf, 2.0*m*n*n - 2.0*n*n*n/3.0 + m*n + n*n
                    + 14.0*n/3.0);
}

// Generate the explicit Q factor from a prior gpu_dgeqrf (MAGMA).
void gpu_dorgqr(int m, int n, int k, double *da, int ldda, double *tau,
                double *dT, int nb)
{
    int info;
    GPU_PROFILE_BEGIN();
    magma_dorgqr_gpu(m, n, k, da, ldda, tau, dT, nb, &info);
    if (info) throw(lapack_exception("GPU DORGQR", info));
    GPU_PROFILE_END(profile_dorgqr, 4.0*m*n*k - 2.0*(m+n)*k*k + 4.0*k*k*k/3.0
                    + 3.0*n*k - m*k - k*k - 4.0*k/3.0);
}

// LU factorization with partial pivoting (MAGMA).
void gpu_dgetrf(int m, int n, double *dA, int ldda, int *ipiv)
{
    int info;
    GPU_PROFILE_BEGIN();
    magma_dgetrf_gpu(m, n, dA, ldda, ipiv, &info);
    if (info) throw(lapack_exception("GPU DGETRF", info));
    GPU_PROFILE_END(profile_dgetrf, m*n*n - n*n*n/3.0 - n*n/2.0 + 5.0*n/6.0);
}

// Solve op(A)*X = B using a prior gpu_dgetrf factorization (MAGMA).
void gpu_dgetrs(const char *trans, int n, int nrhs, double *dA, int ldda,
                int *ipiv, double *dB, int lddb)
{
    int info;
    GPU_PROFILE_BEGIN();
    magma_dgetrs_gpu(trans[0], n, nrhs, dA, ldda, ipiv, dB, lddb, &info);
    if (info) throw(lapack_exception("GPU DGETRS", info));
    GPU_PROFILE_END(profile_dgetrs, nrhs * (2.0*n*n - n));
}

// Host -> device vector transfer (n elements of 'size' bytes each).
void gpu_setvector(int n, int size, void *src, void *dst)
{
    GPU_PROFILE_BEGIN();
    CUDACHECK(cublasSetVector(n, size, src, 1, dst, 1));
    GPU_PROFILE_END(profile_transfer, 2.0 * n * size);
}

// Device -> host vector transfer.
void gpu_getvector(int n, int size, void *src, void *dst)
{
    GPU_PROFILE_BEGIN();
    CUDACHECK(cublasGetVector(n, size, src, 1, dst, 1));
    GPU_PROFILE_END(profile_transfer, 2.0 * n * size);
}

// Host -> device m x n matrix transfer (leading dimension m on both sides).
void gpu_setmatrix(int m, int n, int size, void *src, void *dst)
{
    GPU_PROFILE_BEGIN();
    CUDACHECK(cublasSetMatrix(m, n, size, src, m, dst, m));
    GPU_PROFILE_END(profile_transfer, 2.0 * m * n * size);
}

// Device -> host m x n matrix transfer (leading dimension m on both sides).
void gpu_getmatrix(int m, int n, int size, void *src, void *dst)
{
    GPU_PROFILE_BEGIN();
    CUDACHECK(cublasGetMatrix(m, n, size, src, m, dst, m));
    GPU_PROFILE_END(profile_transfer, 2.0 * m * n * size);
}

// Device-to-device copy of 'size' bytes.
void gpu_copy(void *dst, void *src, int size)
{
    GPU_PROFILE_BEGIN();
    // BUGFIX: the original lacked the ';' after CUDACHECK(...), a syntax
    // error in DQMC_PROFILE builds where GPU_PROFILE_END expands to code.
    CUDACHECK(cudaMemcpy(dst, src, size, cudaMemcpyDeviceToDevice));
    GPU_PROFILE_END(profile_dcopy, 2.0 * size);
}

// Rank sort: block idx counts the elements that precede val[idx] in the
// ordering (ties broken by index) and stores idx at that position in 'ind'
// (counting val[i] > myValue, i.e. apparently descending order -- verify
// against callers).  The K threads of the block share the counting loop.
// NOTE(review): same pre-Volta warp-synchronous reduction caveat as
// kernel_normcol.
__global__ void kernel_sort(int length, double *val, int *ind)
{
    const unsigned int idx = blockIdx.x;
    int i, tid = threadIdx.x;
    __shared__ volatile int shared[K];
    int w = 0;
    double myValue = val[idx];
    for (i = tid; i < length; i += K) {
        // BUGFIX: the original compared 'myValue' against the POINTER 'val'
        // (ill-formed: double vs double*); the element val[i] was intended.
        if (myValue < val[i] || (val[i] == myValue && i > idx)) {
            w++;
        }
    }
    shared[tid] = w;
    __syncthreads();
    if (K >= 512) { if (tid < 256) { shared[tid] = w = w + shared[tid + 256]; } __syncthreads(); }
    if (K >= 256) { if (tid < 128) { shared[tid] = w = w + shared[tid + 128]; } __syncthreads(); }
    if (K >= 128) { if (tid < 64) { shared[tid] = w = w + shared[tid + 64]; } __syncthreads(); }
    if (tid < 32) {
        if (K >= 64) shared[tid] = w = w + shared[tid + 32];
        if (K >= 32) shared[tid] = w = w + shared[tid + 16];
        if (K >= 16) shared[tid] = w = w + shared[tid + 8];
        if (K >= 8) shared[tid] = w = w + shared[tid + 4];
        if (K >= 4) shared[tid] = w = w + shared[tid + 2];
        if (K >= 2) shared[tid] = w = w + shared[tid + 1];
    }
    if (tid == 0) ind[w] = idx;
}

void gpu_sort(int n, double *Db, int *ipiv)
{
    // GPU version: one block per element.  (The original also computed an
    // unused ceil(n/K) block count; removed as dead code.)
    GPU_PROFILE_BEGIN();
    kernel_sort <<< n , K >>> (n, Db, ipiv);
    CUDACHECK(cudaGetLastError());
    GPU_PROFILE_END(profile_sort, 8.0 * n * n);
}

// Create the CUBLAS handle (idempotent) and prefer L1 cache for every
// hand-written kernel, since they use only a tiny shared array.
void gpu_init()
{
    if (!handle) CUDACHECK(cublasCreate(&handle));
    CUDACHECK(cudaFuncSetCacheConfig(kernel_normcol, cudaFuncCachePreferL1));
    CUDACHECK(cudaFuncSetCacheConfig(kernel_permute, cudaFuncCachePreferL1));
    CUDACHECK(cudaFuncSetCacheConfig(kernel_scalerowperm, cudaFuncCachePreferL1));
    CUDACHECK(cudaFuncSetCacheConfig(kernel_scalerow, cudaFuncCachePreferL1));
    CUDACHECK(cudaFuncSetCacheConfig(kernel_scalerowcol, cudaFuncCachePreferL1));
    CUDACHECK(cudaFuncSetCacheConfig(kernel_sort, cudaFuncCachePreferL1));
}

// Destroy the CUBLAS handle; safe to call when gpu_init was never run.
void gpu_shutdown()
{
    if (handle) CUDACHECK(cublasDestroy(handle));
    handle = NULL;
}
|

捐助貴賓 (正式寫手)

捐助貴賓 (正式寫手)

| 3 | 1/1 | 返回列表 |
| 最具人氣熱帖推薦 [查看全部] | 作者 | 回/看 | 最后發(fā)表 | |
|---|---|---|---|---|
|
[考研] 279 分 求調(diào)劑 +4 | 睡個好覺_16 2026-03-24 | 4/200 |
|
|---|---|---|---|---|
|
[考研] 一志愿太原理工安全工程300分,求調(diào)劑 +4 | 0857求調(diào)劑. 2026-03-24 | 4/200 |
|
|
[考研] 材料學(xué)碩333求調(diào)劑 +8 | 北道巷 2026-03-24 | 8/400 |
|
|
[碩博家園] 招收生物學(xué)/細(xì)胞生物學(xué)調(diào)劑 +3 | IceGuo 2026-03-26 | 4/200 |
|
|
[考研] 349求調(diào)劑 +4 | 李木子啊哈哈 2026-03-25 | 4/200 |
|
|
[考研] 327求調(diào)劑 +7 | prayer13 2026-03-23 | 7/350 |
|
|
[考研] 286求調(diào)劑 +13 | Faune 2026-03-21 | 13/650 |
|
|
[考研] 材料277求調(diào)劑 +5 | min3 2026-03-24 | 5/250 |
|
|
[考研] 化學(xué)調(diào)劑一志愿上海交通大學(xué)336分-本科上海211 +4 | 小魚愛有機(jī) 2026-03-25 | 4/200 |
|
|
[考研] 07化學(xué)303求調(diào)劑 +5 | 睿08 2026-03-25 | 5/250 |
|
|
[考研] 一志愿中南大學(xué)化學(xué)學(xué)碩0703總分337求調(diào)劑 +7 | niko- 2026-03-22 | 7/350 |
|
|
[考研] 求b區(qū)院校調(diào)劑 +4 | 周56 2026-03-24 | 5/250 |
|
|
[考研] 0854人工智能方向招收調(diào)劑 +4 | 章小魚567 2026-03-24 | 4/200 |
|
|
[考研] 086003食品工程求調(diào)劑 +6 | 淼淼111 2026-03-24 | 6/300 |
|
|
[考研] 340求調(diào)劑 +5 | 話梅糖111 2026-03-24 | 5/250 |
|
|
[考研] 311求調(diào)劑 +3 | 冬十三 2026-03-24 | 3/150 |
|
|
[考研] 0703化學(xué)調(diào)劑,求導(dǎo)師收 +7 | 天天好運(yùn)來上岸 2026-03-24 | 7/350 |
|
|
[考研] 一志愿吉大化學(xué)322求調(diào)劑 +4 | 17501029541 2026-03-23 | 6/300 |
|
|
[考研]
|
酥酥魚.. 2026-03-21 | 4/200 |
|
|
[考研] 一志愿南大,0703化學(xué),分?jǐn)?shù)336,求調(diào)劑 +3 | 收到VS 2026-03-21 | 3/150 |
|