Launching the NVIDIA Debugging Tool Parallel Nsight

  • Line(2): The CompilerOracle message shows that the program is Java-based.
[guest13@gpu03 ukletter]$ nsight
CompilerOracle: exclude java/lang/reflect/Array.newInstance
[guest13@gpu03 ukletter]$



Browsing the CUDA Installation Files

[guest13@gpu03 ukletter]$ cd /usr/local/cuda
[guest13@gpu03 cuda]$ pwd
/usr/local/cuda
[guest13@gpu03 cuda]$ ls
bin  doc  extras  include  jre  lib  lib64  libnsight  libnvvp  nvvm  open64  samples  src  tools
[guest13@gpu03 cuda]$ cd doc
[guest13@gpu03 doc]$ ls
CUDA_Toolkit_Release_Notes.txt  EULA.txt  html  pdf
[guest13@gpu03 doc]$ cd pdf
[guest13@gpu03 pdf]$ ls
CUDA_C_Best_Practices_Guide.pdf                       CUDA_Toolkit_Reference_Manual.pdf
CUDA_Compiler_Driver_NVCC.pdf                         CUDA_Toolkit_Release_Notes.pdf
CUDA_C_Programming_Guide.pdf                          CUDA_VideoDecoder_Library.pdf
CUDA_CUBLAS_Users_Guide.pdf                           cuobjdump.pdf
CUDA_CUFFT_Users_Guide.pdf                            CUPTI_User_Guide.pdf
CUDA_CUSPARSE_Users_Guide.pdf                         CURAND_Library.pdf
CUDA_Debugger_API.pdf                                 Floating_Point_on_NVIDIA_GPU_White_Paper.pdf
CUDA_Developer_Guide_for_Optimus_Platforms.pdf        Getting_Started_With_CUDA_Samples.pdf
CUDA_Dynamic_Parallelism_Programming_Guide.pdf        GPUDirect_RDMA.pdf
CUDA_GDB.pdf                                          Kepler_Compatibility_Guide.pdf
CUDA_Getting_Started_Guide_For_Linux.pdf              Kepler_Tuning_Guide.pdf
CUDA_Getting_Started_Guide_For_Mac_OS_X.pdf           NPP_Library.pdf
CUDA_Getting_Started_Guide_For_Microsoft_Windows.pdf  Nsight_Eclipse_Edition_Getting_Started.pdf
CUDA_Memcheck.pdf                                     Preconditioned_Iterative_Methods_White_Paper.pdf
CUDA_Profiler_Users_Guide.pdf                         ptx_isa_3.1.pdf
CUDA_Samples_Guide_To_New_Features.pdf                qwcode.highlight.css
CUDA_Samples.pdf                                      Thrust_Quick_Start_Guide.pdf
CUDA_Samples_Release_Notes.pdf                        Using_Inline_PTX_Assembly_In_CUDA.pdf
[guest13@gpu03 pdf]$ cd ..
[guest13@gpu03 doc]$ ls
CUDA_Toolkit_Release_Notes.txt  EULA.txt  html  pdf
[guest13@gpu03 doc]$ cd ..
[guest13@gpu03 cuda]$ ls
bin  doc  extras  include  jre  lib  lib64  libnsight  libnvvp  nvvm  open64  samples  src  tools
[guest13@gpu03 cuda]$ cd samples
[guest13@gpu03 samples]$ cd bin/linux/release
[guest13@gpu03 release]$ ls
alignedTypes              dxtc                    MonteCarloMultiGPU      simpleMultiGPU
asyncAPI                  eigenvalues             nbody                   simpleP2P
bandwidthTest             fastWalshTransform      newdelete               simplePitchLinearTexture
batchCUBLAS               FDTD3d                  oceanFFT                simplePrintf
bicubicTexture            fluidsGL                particles               simpleSeparateCompilation
bilateralFilter           freeImageInteropNPP     postProcessGL           simpleStreams
bindlessTexture           FunctionPointers        ptxjit                  simpleSurfaceWrite
binomialOptions           grabcutNPP              quasirandomGenerator    simpleTemplates
BlackScholes              histEqualizationNPP     radixSortThrust         simpleTexture
boxFilter                 histogram               randomFog               simpleTexture3D
boxFilterNPP              HSOpticalFlow           recursiveGaussian       simpleTextureDrv
cdpAdvancedQuicksort      imageDenoising          reduction               simpleVoteIntrinsics
cdpLUDecomposition        imageSegmentationNPP    scalarProd              simpleZeroCopy
cdpQuadtree               inlinePTX               scan                    smokeParticles
cdpSimplePrint            interval                segmentationTreeThrust  SobelFilter
cdpSimpleQuicksort        lineOfSight             shfl_scan               SobolQRNG
clock                     Mandelbrot              simpleAssert            sortingNetworks
concurrentKernels         marchingCubes           simpleAtomicIntrinsics  stereoDisparity
conjugateGradient         matrixMul               simpleCallback          template
conjugateGradientPrecond  matrixMulCUBLAS         simpleCubemapTexture    template_runtime
convolutionFFT2D          matrixMulDrv            simpleCUBLAS            threadFenceReduction
convolutionSeparable      matrixMulDynlinkJIT     simpleCUFFT             threadMigration
convolutionTexture        MC_EstimatePiInlineP    simpleDevLibCUBLAS      transpose
cppIntegration            MC_EstimatePiInlineQ    simpleGL                vectorAdd
cudaOpenMP                MC_EstimatePiP          simpleHyperQ            vectorAddDrv
dct8x8                    MC_EstimatePiQ          simpleIPC               volumeFiltering
deviceQuery               MC_SingleAsianOptionP   simpleLayeredTexture    volumeRender
deviceQueryDrv            mergeSort               simpleMPI
dwtHaar1D                 MersenneTwisterGP11213  simpleMultiCopy
[guest13@gpu03 release]$ 



Checking Device Information with the deviceQuery Command

[guest13@gpu03 ukletter]$ deviceQuery
deviceQuery Starting...

 CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 2 CUDA Capable device(s)

Device 0: "Tesla S2050"
  CUDA Driver Version / Runtime Version          5.0 / 5.0
  CUDA Capability Major/Minor version number:    2.0
  Total amount of global memory:                 3072 MBytes (3220897792 bytes)
  (14) Multiprocessors x ( 32) CUDA Cores/MP:    448 CUDA Cores
  GPU Clock rate:                                1147 MHz (1.15 GHz)
  Memory Clock rate:                             1546 Mhz
  Memory Bus Width:                              384-bit
  L2 Cache Size:                                 786432 bytes
  Max Texture Dimension Size (x,y,z)             1D=(65536), 2D=(65536,65535), 3D=(2048,2048,2048)
  Max Layered Texture Size (dim) x layers        1D=(16384) x 2048, 2D=(16384,16384) x 2048
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 32768
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  1536
  Maximum number of threads per block:           1024
  Maximum sizes of each dimension of a block:    1024 x 1024 x 64
  Maximum sizes of each dimension of a grid:     65535 x 65535 x 65535
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 2 copy engine(s)
  Run time limit on kernels:                     No
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  Device supports Unified Addressing (UVA):      Yes
  Device PCI Bus ID / PCI location ID:           2 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

Device 1: "Tesla M2050"
  CUDA Driver Version / Runtime Version          5.0 / 5.0
  CUDA Capability Major/Minor version number:    2.0
  Total amount of global memory:                 3072 MBytes (3220897792 bytes)
  (14) Multiprocessors x ( 32) CUDA Cores/MP:    448 CUDA Cores
  GPU Clock rate:                                1147 MHz (1.15 GHz)
  Memory Clock rate:                             1546 Mhz
  Memory Bus Width:                              384-bit
  L2 Cache Size:                                 786432 bytes
  Max Texture Dimension Size (x,y,z)             1D=(65536), 2D=(65536,65535), 3D=(2048,2048,2048)
  Max Layered Texture Size (dim) x layers        1D=(16384) x 2048, 2D=(16384,16384) x 2048
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 32768
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  1536
  Maximum number of threads per block:           1024
  Maximum sizes of each dimension of a block:    1024 x 1024 x 64
  Maximum sizes of each dimension of a grid:     65535 x 65535 x 65535
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 2 copy engine(s)
  Run time limit on kernels:                     No
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  Device supports Unified Addressing (UVA):      Yes
  Device PCI Bus ID / PCI location ID:           3 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >

deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 5.0, CUDA Runtime Version = 5.0, NumDevs = 2, Device0 = Tesla S2050, Device1 = Tesla M2050
[guest13@gpu03 ukletter]$
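
The same device information can be queried programmatically with the CUDA runtime API. Below is a minimal sketch of my own (not one of the course files) that prints a few of the fields deviceQuery reports; the field names come from the cudaDeviceProp structure, and it builds with nvcc like the other examples.

#include <stdio.h>
#include <cuda_runtime.h>

int main(void) {
    int count = 0;
    cudaGetDeviceCount(&count);
    printf("Detected %d CUDA Capable device(s)\n", count);

    for (int dev = 0; dev < count; dev++) {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, dev);
        printf("Device %d: \"%s\"\n", dev, prop.name);
        printf("  CUDA Capability:                %d.%d\n", prop.major, prop.minor);
        printf("  Total global memory:            %lu MBytes\n",
               (unsigned long)(prop.totalGlobalMem / (1024 * 1024)));
        printf("  Multiprocessors:                %d\n", prop.multiProcessorCount);
        printf("  Warp size:                      %d\n", prop.warpSize);
        printf("  Max threads per block:          %d\n", prop.maxThreadsPerBlock);
        printf("  Max threads per multiprocessor: %d\n", prop.maxThreadsPerMultiProcessor);
    }
    return 0;
}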



Measuring Bandwidth with the bandwidthTest Command

  • Line(33, 55): Host to Device Bandwidth - Pageable (2650.2 MB/s) < Pinned (4164.0 MB/s)
  • Line(38, 60): Device to Host Bandwidth - Pageable (2176.0 MB/s) < Pinned (4804.6 MB/s)
  • Line(176, 287): Measuring the device-to-host bandwidth while growing the transfer size in 1024-byte steps shows that, for a 102400-byte transfer, pinned memory is roughly four times (or more) faster than pageable memory.
  • Because pinned memory is faster than pageable memory, pinned memory is the usual (and recommended) choice. Pinned memory is also called page-locked memory because the OS cannot page that region out; pageable memory, by contrast, can be paged in and out by the OS. A minimal pinned-vs-pageable comparison is sketched right after this list.
  • For reference, the peak transfer rate of the PCI Express slot is 8 GB/s, while pinned memory reaches only about 4 GB/s for a 102400-byte transfer. If you copy the data below into Excel and plot it, you can see the shape of the curve; as the transfer size grows further, the rate gets closer to 8 GB/s.
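
The following is a minimal sketch of my own (not one of the course files) showing where the difference comes from: the only change between the two cases is whether the host buffer comes from malloc (pageable) or cudaMallocHost (pinned); the device-to-host copy itself is identical.

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Time one device-to-host copy of 'bytes' into 'host_dst' and return milliseconds.
static float time_d2h(void *host_dst, void *dev_src, size_t bytes) {
    cudaEvent_t start, end;
    cudaEventCreate(&start);
    cudaEventCreate(&end);
    cudaEventRecord(start, 0);
    cudaMemcpy(host_dst, dev_src, bytes, cudaMemcpyDeviceToHost);
    cudaEventRecord(end, 0);
    cudaEventSynchronize(end);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, end);
    cudaEventDestroy(start);
    cudaEventDestroy(end);
    return ms;
}

int main(void) {
    size_t bytes = 102400;              // same size as the last row of the range test
    void *dev, *pageable, *pinned;
    cudaMalloc(&dev, bytes);
    pageable = malloc(bytes);           // ordinary pageable host memory
    cudaMallocHost(&pinned, bytes);     // page-locked (pinned) host memory

    printf("pageable : %.3f ms\n", time_d2h(pageable, dev, bytes));
    printf("pinned   : %.3f ms\n", time_d2h(pinned,   dev, bytes));

    free(pageable);
    cudaFreeHost(pinned);
    cudaFree(dev);
    return 0;
}
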
[guest13@gpu03 ukletter]$ bandwidthTest
[CUDA Bandwidth Test] - Starting...
Running on...

 Device 0: Tesla S2050
 Quick Mode

 Host to Device Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)    Bandwidth(MB/s)
   33554432            3639.8

 Device to Host Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)    Bandwidth(MB/s)
   33554432            5681.8

 Device to Device Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)    Bandwidth(MB/s)
   33554432            124251.4

[guest13@gpu03 ukletter]$ bandwidthTest --memory=pageable
[CUDA Bandwidth Test] - Starting...
Running on...

 Device 0: Tesla S2050
 Quick Mode

 Host to Device Bandwidth, 1 Device(s)
 PAGEABLE Memory Transfers
   Transfer Size (Bytes)    Bandwidth(MB/s)
   33554432            2650.2

 Device to Host Bandwidth, 1 Device(s)
 PAGEABLE Memory Transfers
   Transfer Size (Bytes)    Bandwidth(MB/s)
   33554432            2176.0

 Device to Device Bandwidth, 1 Device(s)
 PAGEABLE Memory Transfers
   Transfer Size (Bytes)    Bandwidth(MB/s)
   33554432            102549.9

[guest13@gpu03 ukletter]$ bandwidthTest --memory=pinned
[CUDA Bandwidth Test] - Starting...
Running on...

 Device 0: Tesla S2050
 Quick Mode

 Host to Device Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)    Bandwidth(MB/s)
   33554432            4164.0

 Device to Host Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)    Bandwidth(MB/s)
   33554432            4804.6

 Device to Device Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)    Bandwidth(MB/s)
   33554432            73970.2

[guest13@gpu03 ukletter]$ bandwidthTest --memory=pageable --mode=range --dtoh --start=1024 --end=102400 --increment=1024
[CUDA Bandwidth Test] - Starting...
Running on...

 Device 0: Tesla S2050
 Range Mode

 Device to Host Bandwidth, 1 Device(s)
 PAGEABLE Memory Transfers
   Transfer Size (Bytes)    Bandwidth(MB/s)
   1024                50.6
   2048                96.7
   3072                146.5
   4096                186.9
   5120                62.0
   6144                260.4
   7168                281.3
   8192                333.9
   9216                355.8
   10240            398.6
   11264            397.9
   12288            302.0
   13312            480.9
   14336            361.7
   15360            579.0
   16384            467.8
   17408            626.5
   18432            489.6
   19456            669.8
   20480            504.7
   21504            683.6
   22528            577.5
   23552            746.2
   24576            594.9
   25600            748.9
   26624            695.6
   27648            787.1
   28672            783.5
   29696            737.5
   30720            813.8
   31744            760.6
   32768            842.3
   33792            871.0
   34816            763.3
   35840            906.6
   36864            841.1
   37888            885.6
   38912            916.3
   39936            976.6
   40960            879.8
   41984            946.5
   43008            891.6
   44032            950.0
   45056            916.2
   46080            721.6
   47104            1079.9
   48128            866.0
   49152            1021.2
   50176            759.5
   51200            742.1
   52224            735.7
   53248            753.4
   54272            774.8
   55296            785.9
   56320            780.7
   57344            782.4
   58368            802.1
   59392            820.9
   60416            811.5
   61440            738.9
   62464            810.5
   63488            798.8
   64512            828.0
   65536            827.8
   66560            859.0
   67584            807.7
   68608            832.4
   69632            855.8
   70656            838.1
   71680            837.7
   72704            832.4
   73728            827.2
   74752            832.8
   75776            843.2
   76800            889.9
   77824            865.0
   78848            880.5
   79872            829.8
   80896            934.0
   81920            1103.5
   82944            1144.7
   83968            1142.3
   84992            903.6
   86016            894.6
   87040            919.2
   88064            920.9
   89088            885.9
   90112            912.3
   91136            907.2
   92160            920.3
   93184            890.5
   94208            933.0
   95232            933.4
   96256            964.3
   97280            974.5
   98304            920.0
   99328            968.6
   100352            922.0
   101376            962.0
   102400            975.6

[guest13@gpu03 ukletter]$ bandwidthTest --memory=pinned --mode=range --dtoh --start=1024 --end=102400 --increment=1024
[CUDA Bandwidth Test] - Starting...
Running on...

 Device 0: Tesla S2050
 Range Mode

 Device to Host Bandwidth, 1 Device(s)
 PINNED Memory Transfers
   Transfer Size (Bytes)    Bandwidth(MB/s)
   1024                187.5
   2048                484.0
   3072                706.4
   4096                893.6
   5120                1067.8
   6144                1231.4
   7168                1394.4
   8192                1531.6
   9216                1652.6
   10240            1809.0
   11264            1823.4
   12288            1965.7
   13312            2089.1
   14336            2227.6
   15360            2306.1
   16384            2429.3
   17408            2539.4
   18432            2545.5
   19456            2627.2
   20480            2719.9
   21504            2769.5
   22528            2848.5
   23552            2899.2
   24576            2960.5
   25600            2983.7
   26624            3076.6
   27648            3074.5
   28672            3106.1
   29696            3163.0
   30720            3183.3
   31744            3231.0
   32768            3319.4
   33792            3213.4
   34816            3320.3
   35840            3309.9
   36864            3419.3
   37888            3518.7
   38912            3549.6
   39936            3573.1
   40960            3585.0
   41984            3614.2
   43008            3645.4
   44032            3651.2
   45056            3698.1
   46080            3714.6
   47104            3717.7
   48128            3775.5
   49152            3798.9
   50176            3769.5
   51200            3799.5
   52224            3867.8
   53248            3816.5
   54272            3832.8
   55296            3913.4
   56320            3798.3
   57344            3837.0
   58368            3914.3
   59392            3958.0
   60416            3966.8
   61440            4036.7
   62464            4059.3
   63488            4065.5
   64512            4051.0
   65536            4049.6
   66560            4091.7
   67584            4113.9
   68608            4146.6
   69632            4137.2
   70656            4099.9
   71680            4198.6
   72704            4212.2
   73728            4193.3
   74752            4174.2
   75776            4210.9
   76800            4199.7
   77824            4244.8
   78848            4250.1
   79872            4138.3
   80896            4176.1
   81920            4228.3
   82944            4286.3
   83968            4320.5
   84992            4371.7
   86016            4306.2
   87040            4356.7
   88064            4316.6
   89088            4330.5
   90112            4357.5
   91136            4371.6
   92160            4364.5
   93184            4400.4
   94208            4397.9
   95232            4411.2
   96256            4393.0
   97280            4398.0
   98304            4406.9
   99328            4395.3
   100352            4422.8
   101376            4429.3
   102400            4355.9

[guest13@gpu03 ukletter]$



09-reverse-gpu.cu

[guest13@gpu03 ukletter]$ cat 09-reverse-gpu.cu
#include <stdio.h>
#define N 10

void reverse(int *input, int *output) {
    for(int i=0; i<N; i++) {
        output[i] = input[N-1-i];
    }
    return;
}

void __global__ reverse_gpu(int *input, int *output) {
    int i = blockIdx.x;
    if(i<N)    output[i] = input[N-1-i];
    return;
}

int main(void) {
    int a[N], b[N];
    int *a_dev, *b_dev;
    
    printf("Example of reverse\n");

    printf("data initialize\n");
    for(int i=0; i<N; i++) {
        a[i] = i;
    }

    printf("allocate GPU memory\n");
    cudaMalloc((void**)&a_dev, sizeof(int)*N);
    cudaMalloc((void**)&b_dev, sizeof(int)*N);

    printf("upload data to GPU\n");
    cudaMemcpy(a_dev, a, sizeof(int)*N, cudaMemcpyHostToDevice);

    reverse_gpu<<<N,1>>>(a_dev, b_dev);
    printf("done....\n");

    printf(" download result from GPU\n");
    cudaMemcpy(b, b_dev, sizeof(int)*N, cudaMemcpyDeviceToHost);
    for(int i=0; i<N; i++) {
        printf("%d : %d -> %d \n", i, a[i], b[i]);
    }
    return 0;
}

[guest13@gpu03 ukletter]$ nvcc 09-reverse-gpu.cu
[guest13@gpu03 ukletter]$ ./a.out
Example of reverse
data initialize
allocate GPU memory
upload data to GPU
done....
 download result from GPU
0 : 0 -> 9 
1 : 1 -> 8 
2 : 2 -> 7 
3 : 3 -> 6 
4 : 4 -> 5 
5 : 5 -> 4 
6 : 6 -> 3 
7 : 7 -> 2 
8 : 8 -> 1 
9 : 9 -> 0 
[guest13@gpu03 ukletter]$
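
One small omission in the file above: the device buffers are never released. A minimal cleanup that could be appended just before "return 0;" (my addition, not part of the original file) would be:

    cudaFree(a_dev);      // release the buffers allocated with cudaMalloc
    cudaFree(b_dev);
    cudaDeviceReset();    // optional: tear the CUDA context down explicitly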



10-reverse-gpu-bt.cu

  • Line(3): N was changed from 10 to 100.
  • Line(36~39): This is a one-dimensional computation with only an x axis. For a one-dimensional computation the thread size is set to match the number of GPU cores (for a three-dimensional computation it should be the floor of the total number of GPU cores divided by 3). Here it was fixed at 10, and since there are 100 operations in total, the block count is the operation count divided by the thread count, 10 (= 100 / 10); with the thread count fixed, the block count follows from it. A small host-side sketch of this index mapping follows this list, before the listing.
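
The following host-only sketch (my illustration, not one of the course files) emulates that index mapping: every element index 0..99 is produced exactly once by some (blockIdx.x, threadIdx.x) pair.

#include <stdio.h>
#define N 100

int main(void) {
    int tsize = 10;           // threads per block  (threadIdx.x runs 0..9)
    int bsize = N / tsize;    // number of blocks   (blockIdx.x runs 0..9)

    for (int b = 0; b < bsize; b++) {
        for (int t = 0; t < tsize; t++) {
            int i = b * tsize + t;   // same formula as blockIdx.x*blockDim.x + threadIdx.x
            printf("block %d, thread %d -> element %d\n", b, t, i);
        }
    }
    return 0;
}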

[guest13@gpu03 ukletter]$ cat 10-reverse-gpu-bt.cu 
#include <stdio.h>
#define N 100

void reverse(int *input, int *output) {
    for(int i=0; i<N; i++) {
        output[i] = input[N-1-i];
    }
    return;
}

void __global__ reverse_gpu(int *input, int *output) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i<N)    output[i] = input[N-1-i];
    return;
}

int main(void) {
    int a[N], b[N];
    int *a_dev, *b_dev;
    
    printf("Example of reverse\n");

    printf("data initialize\n");
    for(int i=0; i<N; i++) {
        a[i] = i;
    }

    printf("allocate GPU memory\n");
    cudaMalloc((void**)&a_dev, sizeof(int)*N);
    cudaMalloc((void**)&b_dev, sizeof(int)*N);

    printf("upload data to GPU\n");
    cudaMemcpy(a_dev, a, sizeof(int)*N, cudaMemcpyHostToDevice);

    dim3 bsize, tsize;
    tsize.x = 10;
    bsize.x = N/10;
    reverse_gpu<<<bsize,tsize>>>(a_dev, b_dev);
    printf("done....\n");

    printf(" download result from GPU\n");
    cudaMemcpy(b, b_dev, sizeof(int)*N, cudaMemcpyDeviceToHost);
    for(int i=0; i<N; i++) {
        printf("%d - %d : %d \n", i, a[i], b[i]);
    }
    return 0;
}

[guest13@gpu03 ukletter]$ nvcc 10-reverse-gpu-bt.cu 
[guest13@gpu03 ukletter]$ ./a.out
Example of reverse
data initialize
allocate GPU memory
upload data to GPU
done....
 download result from GPU
0 - 0 : 99 
1 - 1 : 98 
2 - 2 : 97 
3 - 3 : 96 
4 - 4 : 95 
5 - 5 : 94 
6 - 6 : 93 

...(omitted)...

95 - 95 : 4 
96 - 96 : 3 
97 - 97 : 2 
98 - 98 : 1 
99 - 99 : 0 
[guest13@gpu03 ukletter]$



11-reverse-gpu-odd.cu & 12-reverse-gpu-odd-correct.cu

  • Line(3): N was changed from 100 to 101.
  • Line(74): A garbage value from memory is printed: with tsize.x = 10 and bsize.x = N/10 = 10, only 100 threads are launched, so element 100 of the output is never written. The usual rounded-up launch configuration is sketched below, before the listing.
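
The usual way to size the grid when N is not a multiple of the thread count is to round the block count up and keep the "if(i<N)" guard in the kernel; 12-reverse-gpu-odd-correct.cu below does this with an if/else, and the equivalent one-line form is the ceiling division (N + tsize - 1) / tsize. A small stand-alone sketch of the arithmetic (my illustration, not one of the course files):

#include <stdio.h>

int main(void) {
    int tsize = 10;
    int sizes[] = {100, 101, 110, 1000};
    for (int k = 0; k < 4; k++) {
        int N = sizes[k];
        int bsize = (N + tsize - 1) / tsize;   // round up; same result as the if/else in the file below
        printf("N = %4d  ->  <<<%d, %d>>>  launches %d threads (%d unused)\n",
               N, bsize, tsize, bsize * tsize, bsize * tsize - N);
    }
    return 0;
}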

[guest13@gpu03 ukletter]$ cat 11-reverse-gpu-odd.cu 
#include <stdio.h>
#define N 101

void reverse(int *input, int *output) {
    for(int i=0; i<N; i++) {
        output[i] = input[N-1-i];
    }
    return;
}

void __global__ reverse_gpu(int *input, int *output) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i<N)    output[i] = input[N-1-i];
    return;
}

int main(void) {
    int a[N], b[N];
    int *a_dev, *b_dev;
    
    printf("Example of reverse\n");

    printf("data initialize\n");
    for(int i=0; i<N; i++) {
        a[i] = i;
    }

    printf("allocate GPU memory\n");
    cudaMalloc((void**)&a_dev, sizeof(int)*N);
    cudaMalloc((void**)&b_dev, sizeof(int)*N);

    printf("upload data to GPU\n");
    cudaMemcpy(a_dev, a, sizeof(int)*N, cudaMemcpyHostToDevice);

    dim3 bsize, tsize;
    tsize.x = 10;
    bsize.x = N/10;
    reverse_gpu<<<bsize,tsize>>>(a_dev, b_dev);
    printf("done....\n");

    printf(" download result from GPU\n");
    cudaMemcpy(b, b_dev, sizeof(int)*N, cudaMemcpyDeviceToHost);
    for(int i=0; i<N;i++) {
    printf("%d - %d : %d \n", i, a[i], b[i]);
    }
    return 0;
}

[guest13@gpu03 ukletter]$ cp /home/example/2012-12-11/11-reverse-gpu-odd.cu ./.
[guest13@gpu03 ukletter]$ nvcc 11-reverse-gpu-odd.cu 
[guest13@gpu03 ukletter]$ ./a.out
Example of reverse
data initialize
allocate GPU memory
upload data to GPU
done....
 download result from GPU
0 - 0 : 100 
1 - 1 : 99 
2 - 2 : 98 
3 - 3 : 97 
4 - 4 : 96 
5 - 5 : 95 
6 - 6 : 94 

...(omitted)...

95 - 95 : 5 
96 - 96 : 4 
97 - 97 : 3 
98 - 98 : 2 
99 - 99 : 1 
100 - 100 : 140566019
[guest13@gpu03 ukletter]$
[guest13@gpu03 ukletter]$ cat 12-reverse-gpu-odd-correct.cu 
#include <stdio.h>
#define N 101

void reverse(int *input, int *output) {
    for(int i=0; i<N; i++) {
        output[i] = input[N-1-i];
    }
    return;
}

void __global__ reverse_gpu(int *input, int *output) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if(i<N)    output[i] = input[N-1-i];
    return;
}

int main(void) {
    int a[N], b[N];
    int *a_dev, *b_dev;
    
    printf("Example of reverse\n");

    printf("data initialize\n");
    for(int i=0; i<N; i++) {
        a[i] = i;
    }

    printf("allocate GPU memory\n");
    cudaMalloc((void**)&a_dev, sizeof(int)*N);
    cudaMalloc((void**)&b_dev, sizeof(int)*N);

    printf("upload data to GPU\n");
    cudaMemcpy(a_dev, a, sizeof(int)*N, cudaMemcpyHostToDevice);

    dim3 bsize, tsize;
    tsize.x = 10;
    int tmp = N % 10; 

    if(tmp == 0)    bsize.x = (int) N/10;
    else    bsize.x = (int) N/10 + 1;

    printf(" %d : %d = <<<%d , %d>>>\n", N, bsize.x * tsize.x , bsize.x, tsize.x);
    reverse_gpu<<<bsize,tsize>>>(a_dev, b_dev);
    printf("done....\n");

    printf(" download result from GPU\n");
    cudaMemcpy(b, b_dev, sizeof(int)*N, cudaMemcpyDeviceToHost);
    for(int i=0; i<N;i++) {
    printf("%d - %d : %d \n", i, a[i], b[i]);
    }
    return 0;
}

[guest13@gpu03 ukletter]$ nvcc 12-reverse-gpu-odd-correct.cu 
[guest13@gpu03 ukletter]$ ./a.out
Example of reverse
data initialize
allocate GPU memory
upload data to GPU
 101 : 110 = <<<11 , 10>>>
done....
 download result from GPU
0 - 0 : 100 
1 - 1 : 99 
2 - 2 : 98 
3 - 3 : 97 
4 - 4 : 96 
5 - 5 : 95 
6 - 6 : 94 

...(omitted)...

95 - 95 : 5 
96 - 96 : 4 
97 - 97 : 3 
98 - 98 : 2 
99 - 99 : 1 
100 - 100 : 0 
[guest13@gpu03 ukletter]$



13-bt.cu

[guest13@gpu03 ukletter]$ cat 13-bt.cu 
#include <stdio.h>

__global__ void kernel(void) {
    int jump_x = blockDim.x;  // 5
    int jump_y = blockDim.y;  // 5
    int jump_yy = blockDim.x * gridDim.x; // 5 * 6 = 30
    int idx_x = blockIdx.x * (jump_x) + threadIdx.x;  // 0~29
    int idx_y = blockIdx.y * (jump_y) +  threadIdx.y; // 0~19
    int tid = idx_x * (jump_yy) + idx_y; // 0 ~ 889

//  printf("hello from <<< (%d,%d) , (%d,%d) >>>  \n",
//     blockIdx.x,blockIdx.y, threadIdx.x, threadIdx.y )

    printf("%d : (%d,%d)\n", tid, idx_x, idx_y); 
    return;
}

int main(void) {
    printf("hello first CUDA kernel\n");

    dim3 nBlock, nThread;
    nBlock.x = 6; nThread.x = 5;
    nBlock.y = 4; nThread.y = 5;
    
    printf(" configuration :  <<< (%d , %d),( %d, %d) >>> \n ",
        nBlock.x, nBlock.y, nThread.x, nThread.y);
        
    kernel<<< nBlock , nThread >>>();
    printf("done....\n");

    printf("flush the data from GPU\n");
    cudaDeviceReset();

    return 0;
}

[guest13@gpu03 ukletter]$ nvcc -arch=sm_20 13-bt.cu 
[guest13@gpu03 ukletter]$ ./a.out

...(omitted)...

646 : (21,16)
649 : (21,19)
676 : (22,16)
679 : (22,19)
706 : (23,16)
709 : (23,19)
736 : (24,16)
617 : (20,17)
647 : (21,17)
677 : (22,17)
707 : (23,17)
737 : (24,17)
618 : (20,18)
739 : (24,19)
[guest13@gpu03 ukletter]$
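
The file above flattens the two indices as tid = idx_x * (gridDim.x * blockDim.x) + idx_y; with this configuration the stride is 30 but idx_y only runs 0..19, so only 600 of the tid values 0..889 actually occur. For reference, the more common row-major flattening (my sketch, not one of the course files; build with -arch=sm_20 like the file above so the in-kernel printf is available) gives a contiguous range 0..599:

#include <stdio.h>

__global__ void kernel_rowmajor(void) {
    int idx_x = blockIdx.x * blockDim.x + threadIdx.x;  // global x index, 0..29 here
    int idx_y = blockIdx.y * blockDim.y + threadIdx.y;  // global y index, 0..19 here
    int width = gridDim.x * blockDim.x;                 // total threads along x: 6 * 5 = 30
    int tid   = idx_y * width + idx_x;                  // row-major flattening, 0..599
    printf("%d : (%d,%d)\n", tid, idx_x, idx_y);
}

int main(void) {
    dim3 nBlock(6, 4), nThread(5, 5);   // same configuration as 13-bt.cu
    kernel_rowmajor<<<nBlock, nThread>>>();
    cudaDeviceReset();                  // flush the device-side printf buffer
    return 0;
}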



14-gmv-cpu.c

  • Line(63~71): Performs a two-dimensional (doubly nested loop) computation.
  • Line(88~92): A link error caused by not linking the math library needed for the trigonometric functions (fixed by adding -lm).
  • Line(99): The -pg option is given so that the gmon.out statistics file can be generated.
  • Line(108, 119): Running the a.out linked with -pg produces a gmon.out file.
  • Line(122): Redirects the gprof report generated from gmon.out into 1.txt.
  • Line(177~189): Because the amount of computation is small, the measured times come out close to zero.
  • Line(206): This command is supposed to render the statistics in the gmon.out produced by a.out as a graph, but it failed: gprof treated gprof2dot.py as a second gmon file (see the note after the listing).
[guest13@gpu03 ukletter]$ cat 14-gmv-cpu.c 
// Matrix Vector Multiplication : same as the dgemv function in BLAS/cuBLAS
// row major order in C/C++ A[row][col] ~ A[row*rowSize + col]

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define m 1024
#define n 1024  

void datainit(double*, double*, double*, long, long, long, long);
void      gmv(double*, double*, double*, long, long, long, long);
void   result(double*, long);

int main(void) {
    long hA, wA, hX, hY;
    double *A, *X, *Y;
    hA = hY = m;
    wA = hX = n;

    A = (double *)malloc(sizeof(double) * hA * wA);
    X = (double *)malloc(sizeof(double) *  1 * hX);
    Y = (double *)malloc(sizeof(double) * hY *  1);

    printf("start...\n");

    datainit(A, X, Y, hA, wA, hX, hY);
    gmv(A, X, Y, hA, wA, hX, hY);
    result(Y, hY);

    printf("finish...\n");
    return 0;
}

void datainit(double *A, double *X, double *Y, long hA, long wA, long hX, long hY) {
    int row, col;

    // init A
    for(row=0; row<hA; row++) {
        for(col=0; col<wA; col++) {
            A[row*hA+col] = sin(0.001*col) + cos(0.013*row);
        }
    }

    // init X
    for(row=0; row<hX; row++) {
        X[row] = cos(0.003*row);
    }

    // init Y
    for(row=0; row<hY; row++) {
        Y[row] = 0.0;
    }

    return;
}

void gmv(double *A, double *X, double *Y, long hA, long wA, long hX, long hY) {
    int row, col;
    double sum = 0;
    double a_ij = 0;
    double x_j = 0;
    for(row=0; row<hA; row++) {
        for(col=0; col<wA; col++) {
            a_ij = A[row * hA + col];
            x_j = X[col]; 
            sum += a_ij * x_j;
        }
        Y[row] = sum;
        sum = 0.0;
    }

    return;
}

void result(double *Y,  long hY) {
    int row;

    printf("reference value\n");
    row = hY-1;
//  for(row=0; row<hY; row++) {
        printf("Y[%d]=\t %f \t \n", row, Y[row]);
//  }
    return;
}

[guest13@gpu03 ukletter]$ gcc 14-gmv-cpu.c 
/tmp/cceLwRB9.o: In function `datainit':
14-gmv-cpu.c:(.text+0x168): undefined reference to `sin'
14-gmv-cpu.c:(.text+0x183): undefined reference to `cos'
14-gmv-cpu.c:(.text+0x1dd): undefined reference to `cos'
collect2: ld returned 1 exit status
[guest13@gpu03 ukletter]$ gcc -lm 14-gmv-cpu.c 
[guest13@gpu03 ukletter]$ ./a.out
start...
reference value
Y[1023]=     -149.162709      
finish...
[guest13@gpu03 ukletter]$ gcc -lm 14-gmv-cpu.c -pg
[guest13@gpu03 ukletter]$ ls
01-simple-nvcc.c          05-simple-vecadd-cpu.cu          12-reverse-gpu-odd-correct.cu
01-simple-nvcc.cu         06-simple-vecadd-gpu.cu          13-bt.cu
02-simple-kernel.c        07-simple-vecadd-gpu-correct.cu  14-gmv-cpu.c
02-simple-kernel.cu       08-simple-vecadd-gpu-bt.cu       15-gmv-gpu.cu
03-simple-kernelprint.cu  09-reverse-gpu.cu                16-gmv-gpu-timer.cu
04-simple-add.cu          10-reverse-gpu-bt.cu             17-gmv-cublas.cu
05-simple-vecadd-cpu.c    11-reverse-gpu-odd.cu            a.out
[guest13@gpu03 ukletter]$ ./a.out
start...
reference value
Y[1023]=     -149.162709      
finish...
[guest13@gpu03 ukletter]$ ls
01-simple-nvcc.c          06-simple-vecadd-gpu.cu          14-gmv-cpu.c
01-simple-nvcc.cu         07-simple-vecadd-gpu-correct.cu  15-gmv-gpu.cu
02-simple-kernel.c        08-simple-vecadd-gpu-bt.cu       16-gmv-gpu-timer.cu
02-simple-kernel.cu       09-reverse-gpu.cu                17-gmv-cublas.cu
03-simple-kernelprint.cu  10-reverse-gpu-bt.cu             a.out
04-simple-add.cu          11-reverse-gpu-odd.cu            gmon.out
05-simple-vecadd-cpu.c    12-reverse-gpu-odd-correct.cu
05-simple-vecadd-cpu.cu   13-bt.cu
[guest13@gpu03 ukletter]$ gprof ./a.out ./gmon.out > 1.txt
[guest13@gpu03 ukletter]$ ls
01-simple-nvcc.c          06-simple-vecadd-gpu.cu          14-gmv-cpu.c
01-simple-nvcc.cu         07-simple-vecadd-gpu-correct.cu  15-gmv-gpu.cu
02-simple-kernel.c        08-simple-vecadd-gpu-bt.cu       16-gmv-gpu-timer.cu
02-simple-kernel.cu       09-reverse-gpu.cu                17-gmv-cublas.cu
03-simple-kernelprint.cu  10-reverse-gpu-bt.cu             1.txt
04-simple-add.cu          11-reverse-gpu-odd.cu            a.out
05-simple-vecadd-cpu.c    12-reverse-gpu-odd-correct.cu    gmon.out
05-simple-vecadd-cpu.cu   13-bt.cu
[guest13@gpu03 ukletter]$ cat 1.txt
Flat profile:

Each sample counts as 0.01 seconds.
  %   cumulative   self              self     total           
 time   seconds   seconds    calls  ms/call  ms/call  name    
 50.28      0.01     0.01        1    10.06    10.06  datainit
 50.28      0.02     0.01        1    10.06    10.06  gmv
  0.00      0.02     0.00        1     0.00     0.00  result

 %         the percentage of the total running time of the
time       program used by this function.

cumulative a running sum of the number of seconds accounted
 seconds   for by this function and those listed above it.

 self      the number of seconds accounted for by this
seconds    function alone.  This is the major sort for this
           listing.

calls      the number of times this function was invoked, if
           this function is profiled, else blank.
 
 self      the average number of milliseconds spent in this
ms/call    function per call, if this function is profiled,
           else blank.

 total     the average number of milliseconds spent in this
ms/call    function and its descendents per call, if this 
           function is profiled, else blank.

name       the name of the function.  This is the minor sort
           for this listing. The index shows the location of
           the function in the gprof listing. If the index is
           in parenthesis it shows where it would appear in
           the gprof listing if it were to be printed.


             Call graph (explanation follows)


granularity: each sample hit covers 2 byte(s) for 49.73% of 0.02 seconds

index % time    self  children    called         name
                                             <spontaneous>
[1]    100.0    0.00    0.02                     main [1]
                0.01    0.00       1/1           datainit [2]
                0.01    0.00       1/1           gmv [3]
                0.00    0.00       1/1           result [4]
-----------------------------------------------
                0.01    0.00       1/1           main [1]
[2]     50.0    0.01    0.00       1             datainit [2]
-----------------------------------------------
                0.01    0.00       1/1           main [1]
[3]     50.0    0.01    0.00       1             gmv [3]
-----------------------------------------------
                0.00    0.00       1/1           main [1]
[4]      0.0    0.00    0.00       1             result [4]
-----------------------------------------------

 This table describes the call tree of the program, and was sorted by
 the total amount of time spent in each function and its children.

...(omitted)...

 were internal to the cycle, and the calls entry for each member shows,
 for that member, how many times it was called from other members of
 the cycle.



Index by function name

   [2] datainit                [3] gmv                     [4] result
[guest13@gpu03 ukletter]$ gprof ./a.out ./gmon.out /home/gprof2dot.py | dot -Tpng -o dotplot.png
/home/gprof2dot.py: incompatible with first gmon file
[guest13@gpu03 ukletter]$
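
The failure above is because gprof2dot.py was handed to gprof as if it were a second gmon file. gprof2dot normally reads the text output of gprof from a pipe, so (assuming the installed script behaves like the standard gprof2dot) the usual invocation would be along the lines of: gprof ./a.out gmon.out | /home/gprof2dot.py | dot -Tpng -o dotplot.png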



15-gmv-gpu.cu

[guest13@gpu03 ukletter]$ cat 15-gmv-gpu.cu 
// Matrix Vector Multiplication : same as the dgemv function in BLAS/cuBLAS
// row major order in C/C++ A[row][col] ~ A[row*rowSize + col]

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define m 4096
#define n 1024

void datainit(double*, double*, double*, long, long, long, long);
void      gmv(double*, double*, double*, long, long, long, long);
void  gmv_gpu(double*, double*, double*, long, long, long, long);
__global__
void gmv_cuda(double*, double*, double*, long, long, long, long);
void   result(double *, long);
void  cudaErr(int i, char* msg);

int main(void) {
    long hA, wA, hX, hY;
    double *A, *X, *Y;
    hA = hY = m;
    wA = hX = n;

    A = (double *)malloc(sizeof(double) * hA * wA); // m*n
    X = (double *)malloc(sizeof(double) *  1 * hX); // n
    Y = (double *)malloc(sizeof(double) * hY *  1); // m

    printf("start...\n");

    datainit(A, X, Y, hA, wA, hX, hY);
    gmv(A, X, Y, hA, wA, hX, hY);
    result(Y, hY);

    datainit(A, X, Y, hA, wA, hX, hY);
    gmv_gpu(A, X, Y, hA, wA, hX, hY);
    result(Y, hY);

    printf("finish...\n");
    return 0;
}

void datainit(double *A, double *X, double *Y, long hA, long wA, long hX, long hY) {
    int row, col;

    // init A
    for(row=0; row<hA; row++) {
        for(col=0; col<wA; col++) {
            A[row*wA+col] = sin(0.001*col) + cos(0.013*row);
        }
    }

    // init X
    for(row=0; row<hX; row++) {
        X[row] = cos(0.003*row);
    }

    // init Y
    for(row=0; row<hY; row++) {
        Y[row] = 0.0;
    }

    return;
}

__global__
void gmv_cuda(double *A, double *X, double *Y, long hA, long wA, long hX, long hY) {
//<<< A,B>>>  // 1d block 1d thread
    int row, col;
    double sum = 0;
    double a_ij = 0;
    double x_j = 0;
    //TODO

    row = blockIdx.x * blockDim.x + threadIdx.x; // unique id

    if(row<hY) {

//  for(row=0; row<hA; row++) {
        for(col=0; col<wA; col++) {
            a_ij = A[row * wA + col];
            x_j = X[col];
            sum += a_ij * x_j;
        }
        Y[row] = sum;
//  }

    }
    else{
        return;
    }

    return;
}

void gmv(double *A, double *X, double *Y, long hA, long wA, long hX, long hY) {
    int row, col;
    double sum = 0;
    double a_ij = 0;
    double x_j = 0;
    //TODO

    for(row=0; row<hA; row++) {
        for(col=0; col<wA; col++) {
            a_ij = A[row * wA + col];
            x_j = X[col];
            sum += a_ij * x_j;
        }
        Y[row] = sum;
        sum = 0;
    }

    return;
}

void gmv_gpu(double *A, double *X, double *Y, long hA, long wA, long hX, long hY) {
    int row, col;

    printf("GPU version\n");
    //gmv(A, X, Y, hA, wA, hX, hY);
    //cpu --> gpu clone
    double *A_dev, *X_dev, *Y_dev; //malloc gpu ptr

    size_t A_size, X_size, Y_size;
    A_size = sizeof(double) * hA * wA;
    X_size = sizeof(double) *  1 * hX;
    Y_size = sizeof(double) * hY * 1;

    cudaMalloc((void**)&A_dev, A_size);cudaErr(0,"malloc_A");
    cudaMalloc((void**)&X_dev, X_size);cudaErr(0,"malloc_X");
    cudaMalloc((void**)&Y_dev, Y_size);cudaErr(0,"malloc_Y");

    cudaMemset(A_dev, 0.0, A_size);cudaErr(0,"memset_a");
    cudaMemset(X_dev, 0.0, X_size);cudaErr(0,"memset_x");
    cudaMemset(Y_dev, 0.0, Y_size);cudaErr(0,"memset_Y");

//memcpy
    cudaMemcpy(A_dev, A, A_size, cudaMemcpyHostToDevice);
    cudaErr(0, "memcpy_a");
    cudaMemcpy(X_dev, X, X_size, cudaMemcpyHostToDevice);
    cudaErr(0, "memcpy_x");

    // for == __global__ void (index) & <<<A,B>>>

    dim3 bs, ts;
    ts.x = 128;
    int tmp1;
    tmp1= 1;//
    bs.x = hY/128+tmp1;

    gmv_cuda <<< bs, ts >>> (A_dev, X_dev, Y_dev, hA, wA, hX, hY);
    cudaErr(0, "launcher");

//memcpy
    cudaMemcpy(Y, Y_dev, Y_size, cudaMemcpyDeviceToHost);
    cudaErr(0, "memcpy_Y");
    return;
}

void result(double *Y,  long hY) {
    int row;

    row = hY-1;
    printf("reference value\n");
//  for(row=0; row<hY; row++) {
        printf("Y[%d]=\t %f \t \n", row, Y[row]);
//  }
    return;
}

void cudaErr(int i, char* msg) {
    cudaError_t err;
    err = cudaGetLastError();
    if(int (err)  != 0) {
        printf("cuda :%s %d :%d %s \n", msg, i, err, cudaGetErrorString(err));
    }
    return;
}

[guest13@gpu03 ukletter]$
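
Since the program only prints the last element of Y, a simple way to verify the GPU result against the CPU reference is an element-wise comparison with a tolerance. A sketch of such a helper (hypothetical, not in the course files), which could be called as compare(Y_cpu, Y_gpu, hY, 1e-9) after keeping both result vectors:

#include <stdio.h>
#include <math.h>

// Count elements whose relative error against the reference exceeds 'tol'.
int compare(const double *ref, const double *gpu, long n, double tol) {
    int bad = 0;
    for (long i = 0; i < n; i++) {
        double denom = fabs(ref[i]) > 1e-12 ? fabs(ref[i]) : 1.0;
        double relerr = fabs(ref[i] - gpu[i]) / denom;
        if (relerr > tol) {
            if (bad < 10)   // print only the first few mismatches
                printf("mismatch at %ld: ref=%f gpu=%f\n", i, ref[i], gpu[i]);
            bad++;
        }
    }
    printf("%d of %ld elements exceed tol=%g\n", bad, n, tol);
    return bad;
}

int main(void) {
    double ref[3] = {1.0, 2.0, 3.0};
    double gpu[3] = {1.0, 2.0000001, 2.0};   // last element deliberately wrong
    compare(ref, gpu, 3, 1e-6);
    return 0;
}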



16-gmv-gpu-timer.cu

[guest13@gpu03 ukletter]$ cat 16-gmv-gpu-timer.cu 
// Matrix Vector Multiplication : same as the dgemv function in BLAS/cuBLAS
// row major order in C/C++ A[row][col] ~ A[row*rowSize + col]

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define m 9216
#define n 8192

void datainit(double*, double*, double*, long, long, long, long);
void      gmv(double*, double*, double*, long, long, long, long);
void  gmv_gpu(double*, double*, double*, long, long, long, long);
__global__
void  gmv_cuda(double*, double*, double*, long, long, long, long);
void   result(double *, long);
void cudaErr(int i, char* msg);

int main(void) {
    long hA, wA, hX, hY;
    double *A, *X, *Y;
    hA = hY = m;
    wA = hX = n;

    A = (double *)malloc(sizeof(double) * hA * wA); // m*n
    X = (double *)malloc(sizeof(double) *  1 * hX); // n
    Y = (double *)malloc(sizeof(double) * hY *  1); // m

    printf(" need memory %d KB for A \n", (hA*wA) * 8 / 1024);
    printf(" need memory %d KB for X,\n", hX * 8     / 1024);
    printf(" need memory %d KB for Y,\n", hY * 8     / 1024);
    printf(" need memory %d KB for All \n", (hA * wA + hX + hY) * 8 /1024);

    printf("start...\n");

    datainit(A, X, Y, hA, wA, hX, hY);
    gmv(A, X, Y, hA, wA, hX, hY);
    result(Y,hY);

    datainit(A, X, Y, hA, wA, hX, hY);
    gmv_gpu(A, X, Y, hA, wA, hX, hY);
    result(Y,hY);

    printf("finish...\n");
    return 0;
}

void datainit(double *A, double *X, double *Y, long hA, long wA, long hX, long hY) {
    int row, col;

    // init A
    for(row=0; row<hA; row++) {
        for(col=0; col<wA; col++) {
            A[row*wA+col] = sin(0.001*col) + cos(0.013*row);
        }
    }

    // init X
    for(row=0; row<hX; row++) {
        X[row] = cos(0.003*row);
    }

    // init Y
    for(row=0; row<hY; row++) {
        Y[row] = 0.0;
    }

    return;
}

__global__
void gmv_cuda(double *A, double *X, double *Y, long hA, long wA, long hX, long hY) {
//<<< A,B>>>  // 1d block 1d thread
    int row, col;
    double sum = 0;
    double a_ij = 0;
    double x_j = 0;
    //TODO

    row = blockIdx.x  * blockDim.x + threadIdx.x; // unique id

    if(row<hY) {
//      for(row=0; row<hA; row++) {
            for(col=0; col<wA; col++) {
                a_ij = A[row * wA + col];
                x_j = X[col];
                sum += a_ij * x_j;
            }
            Y[row] = sum;
//      }
    }
    else {
        return;
    }

    return;
}

void gmv(double *A, double *X, double *Y, long hA, long wA, long hX, long hY) {
    int row, col;
    double sum = 0;
    double a_ij = 0;
    double x_j = 0;
    //TODO
    for(row=0; row<hA; row++) {
        for(col=0; col<wA; col++) {
            a_ij = A[row * wA + col];
            x_j = X[col];
            sum += a_ij * x_j;
        }
        Y[row] = sum;
        sum = 0;
    }

    return;
}

void gmv_gpu(double *A, double *X, double *Y, long hA, long wA, long hX, long hY) {
//  int row, col;
    float ev1_wclock_ms, ev2_wclock_ms, ev3_wclock_ms, ev4_wclock_ms;
    cudaEvent_t ev1_start, ev1_end; // malloc
    cudaEventCreate(&ev1_start);
    cudaEventCreate(&ev1_end);

    cudaEvent_t ev2_start, ev2_end; // upload
    cudaEventCreate(&ev2_start);
    cudaEventCreate(&ev2_end);

    cudaEvent_t ev3_start, ev3_end; // kernel
    cudaEventCreate(&ev3_start);
    cudaEventCreate(&ev3_end);

    cudaEvent_t ev4_start, ev4_end; // download
    cudaEventCreate(&ev4_start);
    cudaEventCreate(&ev4_end);

    printf("GPU version\n");
    //gmv(A, X, Y, hA, wA, hX, hY);
    //cpu --> gpu clone
    double *A_dev, *X_dev, *Y_dev; //malloc gpu ptr

    size_t A_size, X_size, Y_size;
    A_size = sizeof(double) * hA * wA;
    X_size = sizeof(double) *  1 * hX;
    Y_size = sizeof(double) * hY * 1;

    cudaEventRecord(ev1_start, 0);

    cudaMalloc((void**)&A_dev, A_size);cudaErr(0,"malloc_A");
    cudaMalloc((void**)&X_dev, X_size);cudaErr(0,"malloc_X");
    cudaMalloc((void**)&Y_dev, Y_size);cudaErr(0,"malloc_Y");

    cudaMemset(A_dev, 0.0, A_size);cudaErr(0,"memset_a");
    cudaMemset(X_dev, 0.0, X_size);cudaErr(0,"memset_x");
    cudaMemset(Y_dev, 0.0, Y_size);cudaErr(0,"memset_Y");

    cudaEventRecord(ev1_end, 0);
    cudaEventSynchronize(ev1_end);
    cudaEventElapsedTime(&ev1_wclock_ms, ev1_start, ev1_end);
    cudaDeviceSynchronize();

//memcpy
    cudaEventRecord(ev2_start, 0);
    cudaMemcpy(A_dev, A, A_size, cudaMemcpyHostToDevice);cudaErr(0, "memcpy_a");
    cudaMemcpy(X_dev, X, X_size, cudaMemcpyHostToDevice);cudaErr(0, "memcpy_x");

    cudaEventRecord(ev2_end, 0);
    cudaEventSynchronize(ev2_end);
    cudaEventElapsedTime(&ev2_wclock_ms, ev2_start, ev2_end);
    cudaDeviceSynchronize();

    // for == __global__ void (index) & <<<A,B>>>

    dim3 bs, ts;
    ts.x=128;
    int tmp1;
    tmp1= 1;//
    bs.x = hY/128+tmp1;

    cudaEventRecord(ev3_start, 0);

    gmv_cuda <<< bs, ts >>> (A_dev, X_dev, Y_dev, hA, wA, hX, hY);cudaErr(0,"launcher");

    cudaEventRecord(ev3_end, 0);
    cudaEventSynchronize(ev3_end);
    cudaEventElapsedTime(&ev3_wclock_ms, ev3_start, ev3_end);
    cudaDeviceSynchronize();

//memcpy
    cudaEventRecord(ev4_start, 0);
    cudaMemcpy(Y, Y_dev, Y_size, cudaMemcpyDeviceToHost); cudaErr(0,"memcpy_Y");

    cudaEventRecord(ev4_end, 0);
    cudaEventSynchronize(ev4_end);
    cudaEventElapsedTime(&ev4_wclock_ms, ev4_start, ev4_end);
    cudaDeviceSynchronize();

    printf(" %8.4f ms for malloc\n",           ev1_wclock_ms);
    printf(" %8.4f ms for upload\n",           ev2_wclock_ms);
    printf(" %8.4f ms for matrixmul kernel\n", ev3_wclock_ms);
    printf(" %8.4f ms for download \n",        ev4_wclock_ms);

    return;
}

void result(double *Y,  long hY) {
    int row;

    row = hY-1;
    printf("reference value\n");
//    for (row = 0; row < hY; row++) {
        printf("Y[%d]=\t %f \t \n",row,Y[row]);

//    }
    return;
}

void cudaErr(int i, char* msg) {
    cudaError_t err;
    err = cudaGetLastError();
    if(int (err)  != 0) {
    printf("cuda :%s %d :%d %s \n",
            msg, i, err, cudaGetErrorString(err));
    }
    return;
}

[guest13@gpu03 ukletter]$ nvcc -arch=sm_20 16-gmv-gpu-timer.cu 
[guest13@gpu03 ukletter]$ ./a.out
 need memory 589824 KB for A 
 need memory 64 KB for X,
 need memory 72 KB for Y,
 need memory 589960 KB for All 
start...
reference value
Y[9215]=     -508.695052      
GPU version
   5.2120 ms for malloc
 178.4099 ms for upload
  63.8990 ms for matrixmul kernel
   0.0593 ms for download 
reference value
Y[9215]=     -508.695052      
finish...
[guest13@gpu03 ukletter]$
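
As a rough cross-check against the bandwidthTest figures above: the upload step copies A (9216 x 8192 x 8 bytes = 576 MB) plus the 64 KB vector X in 178.4 ms, i.e. roughly 576 MB / 0.178 s ≈ 3.2 GB/s. That is of the same order as the host-to-device bandwidths measured earlier; since A and X are allocated with plain malloc, these are pageable-memory transfers, so pinned host buffers (cudaMallocHost) would be the obvious way to speed up the upload further.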



17-gmv-cublas.cu

[guest13@gpu03 ukletter]$ cat 17-gmv-cublas.cu 
// Matrix Vector Multiplication : same as the dgemv function in BLAS/cuBLAS
// row major order in C/C++ A[row][col] ~ A[row*rowSize + col]

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cublas_v2.h>
#define m 9216
#define n 8192

void datainit(double*, double*, double*, long, long, long, long);
void      gmv(double*, double*, double*, long, long, long, long);
void  gmv_gpu(double*, double*, double*, long, long, long, long);
__global__
void  gmv_cuda(double*, double*, double*, long, long, long, long);
void gmv_cublas(double*, double*, double*, long, long, long, long);
void   result(double *, long);
void cudaErr(int i, char* msg);

int main(void) {
    long hA, wA, hX, hY;
    double *A, *X, *Y;
    hA = hY = m;
    wA = hX = n;

    A = (double *)malloc(sizeof(double) * hA * wA); // m*n
    X = (double *)malloc(sizeof(double) *  1 * hX); // n
    Y = (double *)malloc(sizeof(double) * hY * 1); // m

    printf(" need memory %d KB for A \n", (hA*wA) * 8 / 1024);
    printf(" need memory %d KB for X,\n",  hX  *8     / 1024 );
    printf(" need memory %d KB for Y,\n",  hY * 8     / 1024);
    printf(" need memory %d KB for All \n", (hA * wA + hX + hY) * 8 /1024 );

    printf("start...\n");

//  datainit(A, X, Y, hA, wA, hX, hY);
//  gmv(A, X, Y, hA, wA, hX, hY);
//  result(Y, hY);

//  datainit(A, X, Y, hA, wA, hX, hY);
//  gmv_gpu(A, X, Y, hA, wA, hX, hY);
//  result(Y, hY);

    datainit(A, X, Y, hA, wA, hX, hY);
    gmv_cublas(A, X, Y, hA, wA, hX, hY);
    result(Y, hY);

    printf("finish...\n");

    free(A);
    free(X);
    free(Y);

    return 0;
}

void datainit(double *A, double *X, double *Y, long hA, long wA, long hX, long hY) {
    int row, col;

    // init A
    for(row=0; row<hA; row++) {
        for(col=0; col<wA; col++) {
            A[row*wA+col] =  sin(0.001*col) + cos(0.013*row);
        }
    }

    // init X
    for(row=0; row<hX; row++) {
        X[row] = cos(0.003*row);
    }

    // init Y
    for(row=0; row<hY; row++) {
        Y[row] =  0.0;
    }

    return;
}

__global__
void gmv_cuda(double *A, double *X, double *Y, long hA, long wA, long hX, long hY) {
//<<< A,B>>>  // 1d block 1d thread
    int row, col;
    double sum = 0;
    double a_ij = 0;
    double x_j = 0;
    //TODO

    row = blockIdx.x  * blockDim.x + threadIdx.x; // unique id

    if(row<hY) {
//      for(row=0; row<hA; row++) {
            for(col=0; col<wA; col++) {
                a_ij = A[row * wA + col];
                x_j = X[col];
                sum += a_ij * x_j;
            }
        Y[row] = sum;
//     }
    }
    else {
        return;
    }

    return;
}

void gmv(double *A, double *X, double *Y, long hA, long wA, long hX, long hY) {
    int row, col;
    double sum = 0;
    double a_ij = 0;
    double x_j = 0;
    //TODO
    for(row=0; row<hA; row++) {
        for(col=0; col<wA; col++) {
            a_ij = A[row * wA + col];
            x_j = X[col];
            sum += a_ij * x_j;
        }
        Y[row] = sum;
        sum = 0;
    }

    return;
}

void gmv_cublas(double *A, double *X, double *Y, long hA, long wA, long hX, long hY) {
//  int row, col;
    cublasStatus_t status;
    cublasHandle_t handle;
    status = cublasCreate(&handle);
    if(status != CUBLAS_STATUS_SUCCESS) { printf("CUBLAS failed : create \n");  }
    double alpha = 1.0;
    double beta  = 0.0;

    cudaErr(1,"cublas handle");

    float cb1_wclock_ms, cb2_wclock_ms, cb3_wclock_ms, cb4_wclock_ms;
    cudaEvent_t cb1_start, cb1_end; // malloc
    cudaEventCreate(&cb1_start);
    cudaEventCreate(&cb1_end);

    cudaEvent_t cb2_start, cb2_end; // upload
    cudaEventCreate(&cb2_start);
    cudaEventCreate(&cb2_end);

    cudaEvent_t cb3_start, cb3_end; // kernel
    cudaEventCreate(&cb3_start);
    cudaEventCreate(&cb3_end);

    cudaEvent_t cb4_start, cb4_end; // download
    cudaEventCreate(&cb4_start);
    cudaEventCreate(&cb4_end);

    cudaErr(1,"Event Create cb4");

    printf("GPU version\n");
//  gmv(A, X, Y, hA, wA, hX, hY);
//  cpu --> gpu clone
    double *A_cublas, *X_cublas, *Y_cublas; //malloc gpu ptr

    size_t A_size, X_size, Y_size;
    A_size = sizeof(double) * hA * wA;
    X_size = sizeof(double) *  1 * hX;
    Y_size = sizeof(double) * hY * 1;

    cudaEventRecord(cb1_start, 0);

    cudaMalloc((void**)&A_cublas, A_size);
    cudaErr(1, "malloc_A");
    cudaMalloc((void**)&X_cublas, X_size);
    cudaErr(1, "malloc_X");
    cudaMalloc((void**)&Y_cublas, Y_size);
    cudaErr(1, "malloc_Y");

    cudaMemset(A_cublas, 0.0, A_size);
    cudaErr(1, "memset_a");
    cudaMemset(X_cublas, 0.0, X_size);
    cudaErr(1, "memset_x");
    cudaMemset(Y_cublas, 0.0, Y_size);
    cudaErr(1, "memset_Y");

    cudaEventRecord(cb1_end, 0);

    cudaEventSynchronize(cb1_end);
    cudaEventElapsedTime(&cb1_wclock_ms, cb1_start, cb1_end);
    cudaDeviceSynchronize();

//memcpy
    cudaEventRecord(cb2_start, 0);
    status = cublasSetVector(hA*wA, sizeof(double), A, 1, A_cublas, 1);
    if(status != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS failed : setvector A_cublas \n");  }
    status = cublasSetVector(hX,    sizeof(double), X, 1, X_cublas, 1);
    if(status != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS failed : setvector X_cublas \n");  }
//  cudaMemcpy(A_dev, A, A_size, cudaMemcpyHostToDevice);cudaErr(0, "memcpy_a");
//  cudaMemcpy(X_dev, X, X_size, cudaMemcpyHostToDevice);cudaErr(0, "memcpy_x");

    cudaEventRecord(cb2_end, 0);

    cudaEventSynchronize(cb2_end);
    cudaEventElapsedTime(&cb2_wclock_ms, cb2_start, cb2_end);
    cudaDeviceSynchronize();

    // for == __global__ void (index) & <<<A,B>>>

    dim3 bs, ts;
    ts.x =128;
    int tmp1;
    tmp1 = 1;//
    bs.x = hY/128+tmp1;

    cudaEventRecord(cb3_start, 0);
    // cuBLAS assumes column-major storage: the row-major A buffer is read as A^T (wA x hA),
    // so CUBLAS_OP_T is needed here to get y = A * x with the expected vector lengths.
    status = cublasDgemv(handle, CUBLAS_OP_T, wA, hA, &alpha, A_cublas, wA, X_cublas, 1, &beta, Y_cublas, 1);
    if(status != CUBLAS_STATUS_SUCCESS) { printf("CUBLAS failed :dgemv \n");  }

//  gmv_cuda <<< bs, ts >>> (a_dev, X_dev, Y_dev, hA, wA, hX, hY);cudaErr(0, "launcher");

    cudaEventRecord(cb3_end, 0);

    cudaEventSynchronize(cb3_end);
    cudaEventElapsedTime(&cb3_wclock_ms, cb3_start, cb3_end);
    cudaDeviceSynchronize();

//memcpy
    cudaEventRecord(cb4_start, 0);
//  cudaMemcpy(Y, Y_dev, Y_size, cudaMemcpyDeviceToHost); cudaErr(0, "memcpy_Y");
    status = cublasGetVector(hY, sizeof(double), Y_cublas, 1, Y, 1);
    if(status != CUBLAS_STATUS_SUCCESS) { printf("CUBLAS failed : getvector Y_cublas \n");  }

    cudaEventRecord(cb4_end, 0);
    cudaEventSynchronize(cb4_end);
    cudaEventElapsedTime(&cb4_wclock_ms, cb4_start, cb4_end);
    cudaDeviceSynchronize();

    printf(" %8.4f ms for cublas malloc\n",     cb1_wclock_ms);
    printf(" %8.4f ms for cublas setvector\n",  cb2_wclock_ms);
    printf(" %8.4f ms for cublas dgemv\n",      cb3_wclock_ms);
    printf(" %8.4f ms for cublas getvector \n", cb4_wclock_ms);

    status = cublasDestroy(handle);
    if (status != CUBLAS_STATUS_SUCCESS) { printf ("CUBLAS failed : destroy \n");  }

    cudaFree(A_cublas);
    cudaFree(X_cublas);
    cudaFree(Y_cublas);

    return;
}

void gmv_gpu(double *A, double *X, double *Y, long hA, long wA, long hX, long hY) {
//    int row, col;
    float ev1_wclock_ms, ev2_wclock_ms, ev3_wclock_ms, ev4_wclock_ms;
    cudaEvent_t ev1_start, ev1_end; // malloc
    cudaEventCreate(&ev1_start);
    cudaEventCreate(&ev1_end);

    cudaEvent_t ev2_start, ev2_end; // upload
    cudaEventCreate(&ev2_start);
    cudaEventCreate(&ev2_end);

    cudaEvent_t ev3_start, ev3_end; // kernel
    cudaEventCreate(&ev3_start);
    cudaEventCreate(&ev3_end);

    cudaEvent_t ev4_start, ev4_end; // download
    cudaEventCreate(&ev4_start);
    cudaEventCreate(&ev4_end);

    printf("GPU version\n");
//  gmv(A, X, Y, hA, wA, hX, hY);
//  cpu --> gpu clone
    double *A_dev, *X_dev, *Y_dev; //malloc gpu ptr

    size_t A_size, X_size, Y_size;
    A_size = sizeof(double) * hA * wA;
    X_size = sizeof(double) *  1 * hX;
    Y_size = sizeof(double) * hY * 1;

    cudaEventRecord(ev1_start, 0);

    cudaMalloc((void**)&A_dev, A_size);cudaErr(0,"malloc_A");
    cudaMalloc((void**)&X_dev, X_size);cudaErr(0,"malloc_X");
    cudaMalloc((void**)&Y_dev, Y_size);cudaErr(0,"malloc_Y");

    cudaMemset(A_dev, 0.0, A_size);cudaErr(0,"memset_a");
    cudaMemset(X_dev, 0.0, X_size);cudaErr(0,"memset_x");
    cudaMemset(Y_dev, 0.0, Y_size);cudaErr(0,"memset_Y");

    cudaEventRecord(ev1_end, 0);

    cudaEventSynchronize(ev1_end);
    cudaEventElapsedTime(&ev1_wclock_ms, ev1_start, ev1_end);
    cudaDeviceSynchronize();

//memcpy
    cudaEventRecord(ev2_start, 0);
    cudaMemcpy(A_dev, A, A_size, cudaMemcpyHostToDevice);
    cudaErr(0, "memcpy_a");
    cudaMemcpy(X_dev, X, X_size, cudaMemcpyHostToDevice);
    cudaErr(0, "memcpy_x");

    cudaEventRecord(ev2_end, 0);

    cudaEventSynchronize(ev2_end);
    cudaEventElapsedTime(&ev2_wclock_ms, ev2_start, ev2_end);
    cudaDeviceSynchronize();

    // for == __global__ void (index) & <<<A,B>>>

    dim3 bs, ts;
    ts.x = 128;
    int tmp1;
    tmp1 = 1;//
    bs.x = hY/128+tmp1;

    cudaEventRecord(ev3_start, 0);

    gmv_cuda <<< bs, ts >>> (A_dev, X_dev, Y_dev, hA, wA, hX, hY);
    cudaErr(0, "launcher");

    cudaEventRecord(ev3_end, 0);

    cudaEventSynchronize(ev3_end);
    cudaEventElapsedTime(&ev3_wclock_ms, ev3_start, ev3_end);
    cudaDeviceSynchronize();

//memcpy
    cudaEventRecord(ev4_start, 0);
    cudaMemcpy(Y, Y_dev, Y_size, cudaMemcpyDeviceToHost);
    cudaErr(0, "memcpy_Y");

    cudaEventRecord(ev4_end, 0);
    cudaEventSynchronize(ev4_end);
    cudaEventElapsedTime(&ev4_wclock_ms, ev4_start, ev4_end);
    cudaDeviceSynchronize();

    printf(" %8.4f ms for malloc\n",           ev1_wclock_ms);
    printf(" %8.4f ms for upload\n",           ev2_wclock_ms);
    printf(" %8.4f ms for matrixmul kernel\n", ev3_wclock_ms);
    printf(" %8.4f ms for download \n",        ev4_wclock_ms);

    cudaFree(A_dev);
    cudaFree(X_dev);
    cudaFree(Y_dev);

    return;
}

void result(double *Y,  long hY) {
    int row;
    row = hY-1;
    printf("reference value\n");
//  for(row=0; row<hY; row++) {
        printf("Y[%d]=\t %f \t \n", row, Y[row]);
//  }
    return;
}

void cudaErr(int i, char* msg) {
    cudaError_t err;
    err = cudaGetLastError();
    if(int (err) != 0) {
        printf("cuda :%s %d :%d %s \n", msg, i, err, cudaGetErrorString(err));
    }
    return;
}

[guest13@gpu03 ukletter]$ 
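
A note on the dgemv call above: cuBLAS follows the column-major (Fortran) storage convention. Because A is filled in row-major order here, the buffer handed to cuBLAS is effectively the transpose of A with dimensions wA x hA; declaring the matrix as wA x hA with lda = wA and using CUBLAS_OP_T therefore yields y = A * x with the expected vector lengths (x of length hX = wA, y of length hY = hA).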



Running the Visual Profiler (nvvp)

[guest13@gpu03 ukletter]$ nvvp
[guest13@gpu03 ukletter]$