#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <stdio.h>
#include "cuda.h"
#include <iostream>
#include <limits.h>
#include <mex.h>
#include "gpu/mxGPUArray.h"
#include "matrix.h"
#include <thrust/complex.h>
#include <string.h>
//#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
//
//inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
//{
//    if (code != cudaSuccess)
//    {
//        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
//        if (abort) exit(code);
//    }
//}
//
// Single-precision complex number type shared by the host code and the kernels.
typedef thrust::complex<float> fcomp;
// Atomically accumulates the complex value `b` into `*a`.
//
// The real and imaginary parts are added with two independent float
// atomicAdd calls, so each component update is atomic on its own but the
// complex value as a whole is not updated as a single atomic unit.
//
// a : destination element; treated as two consecutive floats
//     (assumes fcomp stores the real part first, then the imaginary part).
// b : value to add.
__device__ void atAddComplex(fcomp* a, fcomp b) {
    float* x = reinterpret_cast<float*>(a);  /* x points to the real part */
    float* y = x + 1;                        /* y points to the next float (imaginary part) */
    // atomicAdd on float operands, one call per component
    atomicAdd(x, b.real());
    atomicAdd(y, b.imag());
}
// Element-wise in-place complex add: Device_Data1[i] += Device_Data2[i] for i < N.
//
// Launch layout: the global index is built from a flattened (x, y) thread index
// inside the block plus a block offset along grid x only, so the grid is
// expected to be one-dimensional while blocks may be 1D or 2D.
//
// Device_DataRes : accepted for interface compatibility; this kernel neither
//                  reads nor writes it (the sum is left in Device_Data1).
// Device_Data1   : accumulator, updated in place.
// Device_Data2   : addend, only read.
// N              : number of elements in each array.
__global__ void add(fcomp* Device_DataRes, fcomp* Device_Data1, fcomp* Device_Data2, int N) {
    int TID = threadIdx.y * blockDim.x + threadIdx.x;           // thread index within the block
    int BlockOFFset = blockDim.x * blockDim.y * blockIdx.x;     // first global index owned by this block
    int GID_RowBased = BlockOFFset + TID;
    // Guard the tail: the grid may contain more threads than elements.
    if (GID_RowBased < N) {
        atAddComplex(&Device_Data1[GID_RowBased], Device_Data2[GID_RowBased]);
    }
}
// Copies two host arrays of N complex singles to the device, runs the `add`
// kernel (which accumulates the second array into the first), and copies the
// accumulated first array back into hostOut.
//
// All device buffers are released before returning, whether or not a CUDA
// call failed. Returns the first CUDA error encountered, or cudaSuccess.
static cudaError_t addComplexOnDevice(const mxComplexSingle* hostA,
                                      const mxComplexSingle* hostB,
                                      mxComplexSingle* hostOut, int N) {
    const size_t ArrayByteSize = sizeof(fcomp) * static_cast<size_t>(N);
    fcomp* Device_Data1 = NULL;
    fcomp* Device_Data2 = NULL;
    fcomp* Device_DataRes = NULL;

    cudaError_t status = cudaMalloc((void**)&Device_Data1, ArrayByteSize);
    if (status == cudaSuccess)
        status = cudaMemcpy(Device_Data1, hostA, ArrayByteSize, cudaMemcpyHostToDevice);
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&Device_Data2, ArrayByteSize);
    if (status == cudaSuccess)
        status = cudaMemcpy(Device_Data2, hostB, ArrayByteSize, cudaMemcpyHostToDevice);
    // The kernel signature takes a result buffer, so one is still provided,
    // even though the kernel currently leaves it untouched.
    if (status == cudaSuccess)
        status = cudaMalloc((void**)&Device_DataRes, ArrayByteSize);

    if (status == cudaSuccess) {
        const unsigned int threadsPerBlock = 1024;
        // Ceiling division: no extra block when N is a multiple of the block size.
        const unsigned int GridX =
            (static_cast<unsigned int>(N) + threadsPerBlock - 1) / threadsPerBlock;
        dim3 block(threadsPerBlock);
        dim3 grid(GridX);
        add<<<grid, block>>>(Device_DataRes, Device_Data1, Device_Data2, N);
        status = cudaGetLastError();  // reports an invalid launch configuration
    }

    // The kernel accumulates into Device_Data1, so that buffer holds the sum.
    // This blocking copy also waits for the kernel to finish.
    if (status == cudaSuccess)
        status = cudaMemcpy(hostOut, Device_Data1, ArrayByteSize, cudaMemcpyDeviceToHost);

    // cudaFree accepts null pointers, so this is safe after a partial failure.
    cudaFree(Device_Data1);
    cudaFree(Device_Data2);
    cudaFree(Device_DataRes);
    return status;
}

// MEX entry point: C = f(A, B), element-wise sum of two complex single arrays.
//
// prhs[0], prhs[1] : complex single arrays with the same number of elements.
// plhs[0]          : N-by-1 complex single column holding A(:) + B(:), where
//                    N is the element count of the inputs.
//
// Raises a MATLAB error on a wrong argument count, wrong input class,
// mismatched sizes, GPU initialisation failure, or any CUDA failure.
// Uses mxGetComplexSingles, which belongs to the interleaved-complex MEX API.
void mexFunction(int nlhs, mxArray* plhs[],
                 int nrhs, const mxArray* prhs[]) {
    if (nrhs != 2) {
        mexErrMsgIdAndTxt("cudaAdd:invalidNumInputs",
                          "Two input arrays are required.");
    }
    for (int i = 0; i < 2; ++i) {
        if (!mxIsSingle(prhs[i]) || !mxIsComplex(prhs[i])) {
            mexErrMsgIdAndTxt("cudaAdd:invalidInputType",
                              "Both inputs must be complex single arrays.");
        }
    }

    // Size the work from the actual input instead of a fixed element count,
    // so the host-to-device copies never read past the end of the inputs.
    const size_t count = mxGetNumberOfElements(prhs[0]);
    if (mxGetNumberOfElements(prhs[1]) != count) {
        mexErrMsgIdAndTxt("cudaAdd:sizeMismatch",
                          "Both inputs must have the same number of elements.");
    }
    if (count > static_cast<size_t>(INT_MAX)) {
        mexErrMsgIdAndTxt("cudaAdd:inputTooLarge",
                          "Inputs have too many elements for the kernel's int index.");
    }
    const int N = static_cast<int>(count);

    plhs[0] = mxCreateNumericMatrix(static_cast<mwSize>(count), 1,
                                    mxSINGLE_CLASS, mxCOMPLEX);
    if (N == 0) {
        return;  // nothing to compute; an empty launch would be invalid
    }

    if (mxInitGPU() != MX_GPU_SUCCESS) {
        mexErrMsgIdAndTxt("cudaAdd:gpuInitFailed",
                          "Could not initialize the MATLAB GPU library.");
    }

    const mxComplexSingle* Data1 = mxGetComplexSingles(prhs[0]);
    const mxComplexSingle* Data2 = mxGetComplexSingles(prhs[1]);
    mxComplexSingle* DataRes = mxGetComplexSingles(plhs[0]);

    const cudaError_t status = addComplexOnDevice(Data1, Data2, DataRes, N);
    if (status != cudaSuccess) {
        mexErrMsgIdAndTxt("cudaAdd:cudaFailure", "CUDA error: %s",
                          cudaGetErrorString(status));
    }
}