NVIDIA Korea - GPGPU Programming with CUDA (Part 1)

2012. 12. 11. 20:04

[guest13@notebook13 ~]$ ssh -X gpu
guest13@gpu's password: 
Last login: Tue Dec 11 19:56:52 2012 from 192.168.1.37
[guest13@gpu03 ~]$ nvidia-smi
Tue Dec 11 21:45:22 2012       
+------------------------------------------------------+                       
| NVIDIA-SMI 4.304.54   Driver Version: 304.54         |                       
|-------------------------------+----------------------+----------------------+
| GPU  Name                     | Bus-Id        Disp.  | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap| Memory-Usage         | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla S2050              | 0000:02:00.0     Off |                  Off |
| N/A   N/A    P0    N/A /  N/A |   0%    6MB / 3071MB |      0%      Default |
+-------------------------------+----------------------+----------------------+
|   1  Tesla M2050              | 0000:03:00.0     Off |                  Off |
| N/A   N/A    P0    N/A /  N/A |   0%    6MB / 3071MB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Compute processes:                                               GPU Memory |
|  GPU       PID  Process name                                     Usage      |
|=============================================================================|
|  No running compute processes found                                         |
+-----------------------------------------------------------------------------+
[guest13@gpu03 ~]$

[guest13@gpu03 ~]$ nvcc
-bash: nvcc: command not found
[guest13@gpu03 ~]$ gcc
gcc: no input files
[guest13@gpu03 ~]$ module load cuda-5.0-hryu
[guest13@gpu03 ~]$ nvcc
nvcc fatal   : No input files specified; use option --help for more information
[guest13@gpu03 ~]$

엔비디아 컴파일러(nvcc) 모듈 로드

01-simple-nvcc.cu

Line(18): nvcc 컴파일러는 .c파일도 컴파일할 수 있다.

[guest13@gpu03 ~]$ cd ukletter
[guest13@gpu03 ukletter]$ cat 01-simple-nvcc.cu
#include <stdio.h>

/* Host-only entry point: writes one greeting line to stdout and exits with status 0. */
int main(int argc, char *argv[]) {
    fputs("Hello, world\n", stdout);
    return 0;
}

[guest13@gpu03 ukletter]$ nvcc 01-simple-nvcc.cu
[guest13@gpu03 ukletter]$ ./a.out
Hello, world
[guest13@gpu03 ukletter]$ gcc 01-simple-nvcc.cu 
01-simple-nvcc.cu: file not recognized: File format not recognized
collect2: ld returned 1 exit status
[guest13@gpu03 ukletter]$ cp 01-simple-nvcc.cu 01-simple-nvcc.c
[guest13@gpu03 ukletter]$ gcc 01-simple-nvcc.c
[guest13@gpu03 ukletter]$ nvcc 01-simple-nvcc.c
[guest13@gpu03 ukletter]$ ./a.out
Hello, world
[guest13@gpu03 ukletter]$

02-simple-kernel.cu

Line(24~30): .c파일 안에 CUDA코드가 들어있어 gcc가 컴파일에 실패한다.
Line(32~38): nvcc는 확장자가 .c인 파일을 일반 C 소스로 보고 호스트 컴파일러(gcc)에 그대로 넘기므로, CUDA 코드가 들어있으면 gcc와 동일한 에러로 컴파일에 실패한다.(CUDA 코드는 .cu 확장자로 저장하거나 nvcc에 -x cu 옵션을 주어야 한다.)

[guest13@gpu03 ukletter]$ cat 02-simple-kernel.cu
#include <stdio.h>

__global__ void kernel(void) { /* Empty device kernel: every launched thread returns immediately. */
    return;
}

int main(void) { /* Host driver: launches the empty kernel, prints progress messages, resets the device. */
    printf("hello first CUDA kernel\n");
    kernel<<< 6 , 4 >>>(); /* launch configuration: 6 blocks, 4 threads per block */
    printf("done....\n");
    printf("flush the data from GPU\n");
    cudaDeviceReset(); /* releases this process's state on the current device before exit */
    return 0;
}

[guest13@gpu03 ukletter]$ nvcc 02-simple-kernel.cu 
[guest13@gpu03 ukletter]$ ./a.out
hello first CUDA kernel
done....
flush the data from GPU
[guest13@gpu03 ukletter]$ cp 02-simple-kernel.cu 02-simple-kernel.c
[guest13@gpu03 ukletter]$ gcc 02-simple-kernel.c
02-simple-kernel.c:3: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’
before ‘void’
02-simple-kernel.c: In function ‘main’:
02-simple-kernel.c:9: error: ‘kernel’ undeclared (first use in this function)
02-simple-kernel.c:9: error: (Each undeclared identifier is reported only once
02-simple-kernel.c:9: error: for each function it appears in.)
02-simple-kernel.c:9: error: expected expression before ‘<’ token
[guest13@gpu03 ukletter]$ nvcc 02-simple-kernel.c
02-simple-kernel.c:3: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’
before ‘void’
02-simple-kernel.c: In function ‘main’:
02-simple-kernel.c:9: error: ‘kernel’ undeclared (first use in this function)
02-simple-kernel.c:9: error: (Each undeclared identifier is reported only once
02-simple-kernel.c:9: error: for each function it appears in.)
02-simple-kernel.c:9: error: expected expression before ‘<’ token
[guest13@gpu03 ukletter]$

03-simple-kernelprint.cu

Line(19~20): Line(5)의 printf()함수를 GPU 내부에서 처리하지 못하여서 에러발생(옛날에는 별도의 에뮬레이터가 필요)
Line(21): nvcc에 -arch=sm_20 옵션부여(Compute Capability 2.0, 즉 Fermi 세대 이상의 GPU를 대상으로 컴파일하는 옵션. 커널 내부의 printf()는 Compute Capability 2.0 이상에서만 지원된다.)
Line(26~49): GPU에서 병렬적으로 처리되는 것을 보여준다. Block Index의 출력 순서는 무작위지만, Block 내에서 Thread Index는 어느 정도 순차적으로 나타난다.

[guest13@gpu03 ukletter]$ cat 03-simple-kernelprint.cu 
#include <stdio.h>

__global__ void kernel(void) { /* Device kernel: each thread prints its own block index and thread index. */
    printf("hello from <<< %d , %d >>>  \n", blockIdx.x, threadIdx.x); /* device-side printf: rejected by the plain nvcc build in this transcript, accepted with -arch=sm_20 */
    return;
}

int main(void) { /* Host driver: launches the printing kernel, prints progress messages, resets the device. */
    printf("hello first CUDA kernel\n");
    kernel<<< 6 , 4 >>>(); /* 6 blocks of 4 threads: 24 lines of device output in total */
    printf("done....\n");
    printf("flush the data from GPU\n");
    cudaDeviceReset(); /* in the recorded run the kernel's printf lines appear only after the host messages above */
    return 0;
}

[guest13@gpu03 ukletter]$ nvcc 03-simple-kernelprint.cu 
03-simple-kernelprint.cu(4): error: calling a __host__ function("printf") from a __global__ function("kernel") is not allowed
1 error detected in the compilation of "/tmp/tmpxft_0000139d_00000000-6_03-simple-kernelprint.cpp1.ii".
[guest13@gpu03 ukletter]$ nvcc -arch=sm_20 03-simple-kernelprint.cu 
[guest13@gpu03 ukletter]$ ./a.out
hello first CUDA kernel
done....
flush the data from GPU
hello from <<< 1 , 0 >>>  
hello from <<< 1 , 1 >>>  
hello from <<< 1 , 2 >>>  
hello from <<< 1 , 3 >>>  
hello from <<< 5 , 0 >>>  
hello from <<< 5 , 1 >>>  
hello from <<< 5 , 2 >>>  
hello from <<< 5 , 3 >>>  
hello from <<< 4 , 0 >>>  
hello from <<< 4 , 1 >>>  
hello from <<< 4 , 2 >>>  
hello from <<< 4 , 3 >>>  
hello from <<< 0 , 0 >>>  
hello from <<< 0 , 1 >>>  
hello from <<< 0 , 2 >>>  
hello from <<< 0 , 3 >>>  
hello from <<< 3 , 0 >>>  
hello from <<< 3 , 1 >>>  
hello from <<< 3 , 2 >>>  
hello from <<< 3 , 3 >>>  
hello from <<< 2 , 0 >>>  
hello from <<< 2 , 1 >>>  
hello from <<< 2 , 2 >>>  
hello from <<< 2 , 3 >>>
[guest13@gpu03 ukletter]$ vi 03-simple-kernelprint.cu 
#include <stdio.h>

/* Device kernel: each thread prints one line containing its block index and thread index. */
__global__ void kernel(void) {
    const unsigned int block = blockIdx.x;
    const unsigned int thread = threadIdx.x;
    printf("hello from <<< %d , %d >>>  \n", block, thread);
}

/*
 * Host driver: launches the printing kernel with 1000 blocks of 1000 threads,
 * prints progress messages, then resets the device.
 */
int main(void) {
    const dim3 grid(1000);   /* number of blocks */
    const dim3 block(1000);  /* threads per block */

    printf("hello first CUDA kernel\n");
    kernel<<< grid , block >>>();
    printf("done....\n");
    printf("flush the data from GPU\n");
    cudaDeviceReset();
    return 0;
}
:wq
[guest13@gpu03 ukletter]$ nvcc -arch=sm_20 03-simple-kernelprint.cu 
[guest13@gpu03 ukletter]$ ./a.out

...생략...

hello from <<< 998 , 24 >>>  
hello from <<< 998 , 25 >>>  
hello from <<< 998 , 26 >>>  
hello from <<< 998 , 27 >>>  
hello from <<< 998 , 28 >>>  
hello from <<< 998 , 29 >>>  
hello from <<< 998 , 30 >>>  
hello from <<< 998 , 31 >>> 
[guest13@gpu03 ukletter]$ vi 03-simple-kernelprint.cu
#include <stdio.h>

/* Device kernel: reports which (block, thread) pair is executing, one line per thread. */
__global__ void kernel(void) {
    const unsigned int blockId = blockIdx.x;
    const unsigned int threadId = threadIdx.x;
    printf("hello from <<< %d , %d >>>  \n", blockId, threadId);
}

/*
 * Host driver: launches the printing kernel with 14 blocks of 32 threads,
 * prints progress messages, then resets the device.
 */
int main(void) {
    /* NOTE(review): 14 x 32 presumably mirrors the 14 multiprocessors x 32 cores
       of the Tesla M2050 shown by nvidia-smi -- confirm. */
    const int blocksPerGrid = 14;
    const int threadsPerBlock = 32;

    printf("hello first CUDA kernel\n");
    kernel<<< blocksPerGrid , threadsPerBlock >>>();
    printf("done....\n");
    printf("flush the data from GPU\n");
    cudaDeviceReset();
    return 0;
}
:wq
[guest13@gpu03 ukletter]$ nvcc -arch=sm_20 03-simple-kernelprint.cu 
[guest13@gpu03 ukletter]$ ./a.out

...생략...

hello from <<< 2 , 24 >>>  
hello from <<< 2 , 25 >>>  
hello from <<< 2 , 26 >>>  
hello from <<< 2 , 27 >>>  
hello from <<< 2 , 28 >>>  
hello from <<< 2 , 29 >>>  
hello from <<< 2 , 30 >>>  
hello from <<< 2 , 31 >>>
[guest13@gpu03 ukletter]$

04-simple-add.cu

Line(15): Block 갯수와 Thread 갯수는 그냥 1로 설정하였다. 이 예제는 GPU에서 연산처리되는 것만 확인한다.

[guest13@gpu03 ukletter]$ cat 04-simple-add.cu
#include <stdio.h>

/*
 * Device kernel: stores a + b into the int that c points to.
 * c is dereferenced on the device; the caller in this file passes a
 * cudaMalloc'ed pointer.
 */
__global__ void add(int a, int b, int *c) {
    c[0] = a + b;
}

/*
 * Host driver: computes 3 + 7 on the GPU with a single-thread launch and
 * prints the result.
 *
 * Every CUDA call is checked. On failure the error string is written to
 * stderr and the program exits with status 1 instead of printing an
 * uninitialized result. On success the output is unchanged.
 */
int main(void) {
    int c = 0;
    int *dev_c = NULL;
    cudaError_t status;

    printf("Example of add\n");

    status = cudaMalloc( (void**)&dev_c, sizeof(int) );
    if (status == cudaSuccess) {
        add<<< 1 , 1 >>>(3, 7, dev_c);
        /* A launch does not return a status; bad configurations surface here. */
        status = cudaGetLastError();
    }
    printf("done....\n");

    printf("download the result from GPU\n");
    if (status == cudaSuccess) {
        /* Blocking copy: also reports faults raised while the kernel ran. */
        status = cudaMemcpy( &c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
    }
    cudaFree( dev_c );  /* no-op when dev_c is still NULL */

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }

    printf("3 + 7 = %d\n", c);
    return 0;
}

[guest13@gpu03 ukletter]$ nvcc 04-simple-add.cu 
[guest13@gpu03 ukletter]$ ./a.out
Example of add
done....
download the result from GPU
3 + 7 = 10
[guest13@gpu03 ukletter]$

05-simple-vecadd-cpu.cu

Line(33~40): for loop 안에 int i; 선언 부분때문에 에러가 발생했다.
Line(41): gcc에 -std=c99 옵션을 부여하여 정상적으로 컴파일을 완료하였다.

[guest13@gpu03 ukletter]$ cat 05-simple-vecadd-cpu.cu 
#include <stdio.h>
#define N 10

void add(int *a, int *b, int *c) { /* CPU vector add: c[i] = a[i] + b[i] for every i in [0, N). */
    for(int i=0; i<N; i++) { /* loop-scoped declaration: gcc accepts it only with -std=c99 (or as C++) */
        c[i] = a[i] + b[i];
    }
    return;
}

int main(void) { /* Host-only driver: fills a and b, adds them on the CPU, prints every sum. */
    int a[N], b[N], c[N];
    
    printf("Example of Vector ADD\n");
    printf("data initialize\n");
    for(int i=0; i<N; i++) { /* a[i] = -i, b[i] = i squared */
        a[i] = -i;
        b[i] = i * i ;
    }

    add(a, b, c);
    printf("done....\n");

    for(int i=0; i<N; i++) { /* second loop-scoped i: gcc without -std=c99 reports it as a redefinition */
        printf("%d + %d = %d \n", a[i], b[i], c[i]);
    }
    return 0;
}

[guest13@gpu03 ukletter]$ cp 05-simple-vecadd-cpu.cu 05-simple-vecadd-cpu.c
[guest13@gpu03 ukletter]$ gcc 05-simple-vecadd-cpu.c
05-simple-vecadd-cpu.c: In function ‘add’:
05-simple-vecadd-cpu.c:6: error: ‘for’ loop initial declarations are only allowed in C99 mode
05-simple-vecadd-cpu.c:6: note: use option -std=c99 or -std=gnu99 to compile your code
05-simple-vecadd-cpu.c: In function ‘main’:
05-simple-vecadd-cpu.c:18: error: ‘for’ loop initial declarations are only allowed in C99 mode
05-simple-vecadd-cpu.c:26: error: redefinition of ‘i’
05-simple-vecadd-cpu.c:18: note: previous definition of ‘i’ was here
05-simple-vecadd-cpu.c:26: error: ‘for’ loop initial declarations are only allowed in C99 mode
[guest13@gpu03 ukletter]$ gcc -std=c99 05-simple-vecadd-cpu.c
[guest13@gpu03 ukletter]$ ./a.out
Example of Vector ADD
data initialize
done....
0 + 0 = 0 
-1 + 1 = 0 
-2 + 4 = 2 
-3 + 9 = 6 
-4 + 16 = 12 
-5 + 25 = 20 
-6 + 36 = 30 
-7 + 49 = 42 
-8 + 64 = 56 
-9 + 81 = 72 
[guest13@gpu03 ukletter]$

06-simple-vecadd-gpu.cu & 07-simple-vecadd-gpu-correct.cu

Line(38): Parameter를 동적할당된 포인터로 써야한다.
Line(42): GPU에서 연산이 끝난 후, Device에서 Host쪽으로 메모리카피가 이루어져야 한다.
Line(57~66): 각각 비정상결과출력과 정상결과출력의 모습이다.

[guest13@gpu03 ukletter]$ cat 06-simple-vecadd-gpu.cu 
#include <stdio.h>
#define N 10

/*
 * CPU reference implementation: writes a[k] + b[k] into c[k] for each of the
 * N elements. Each array must hold at least N ints.
 */
void vecAdd(int *a, int *b, int *c) {
    int k = 0;
    while (k < N) {
        c[k] = a[k] + b[k];
        ++k;
    }
}

/*
 * Device kernel: adds one element per block.
 * The element index is the block index alone (threadIdx is not used).
 * Blocks whose index is N or larger do nothing.
 */
void __global__ vecAdd_gpu(int *a, int *b, int *c) {
    const int elem = blockIdx.x;
    if (elem >= N) {
        return;
    }
    c[elem] = a[elem] + b[elem];
}

/*
 * Host driver for the DELIBERATELY BROKEN GPU vector add.
 *
 * This listing exists to show what goes wrong; the garbage output recorded
 * after it depends on the two mistakes marked "BUG (intentional)" below, so
 * they are left in place. The corrected program is
 * 07-simple-vecadd-gpu-correct.cu.
 *
 * Only change from the recorded listing: the grid size was spelled `n`,
 * an undeclared identifier that cannot compile; it is the macro N.
 */
int main(void) {
    int a[N], b[N], c[N];
    int *a_dev, *b_dev, *c_dev;

    printf("Example of Vector ADD\n");
    printf("data initialize\n");
    for(int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    printf("allocate GPU memory\n");
    cudaMalloc((void**)&a_dev, sizeof(int)*N);
    cudaMalloc((void**)&b_dev, sizeof(int)*N);
    cudaMalloc((void**)&c_dev, sizeof(int)*N);

    printf("upload data to GPU\n");
    cudaMemcpy(a_dev, a, sizeof(int)*N, cudaMemcpyHostToDevice);
    cudaMemcpy(b_dev, b, sizeof(int)*N, cudaMemcpyHostToDevice);

    /* N blocks of 1 thread each; the kernel indexes elements by blockIdx.x. */
    /* BUG (intentional): the host arrays a, b, c are passed instead of the
       device buffers a_dev, b_dev, c_dev. */
    vecAdd_gpu<<<N,1>>>(a, b, c);
    printf("done....\n");

    printf(" download result from GPU\n");
    /* BUG (intentional): c is the host buffer and c_dev the device buffer, so
       the direction should be cudaMemcpyDeviceToHost. */
    cudaMemcpy(c, c_dev, sizeof(int)*N, cudaMemcpyHostToDevice);
    for(int i=0; i<N; i++) {
        printf("%d + %d = %d \n", a[i], b[i], c[i]);
    }
    return 0;
}

[guest13@gpu03 ukletter]$ nvcc 06-simple-vecadd-gpu.cu 
[guest13@gpu03 ukletter]$ ./a.out
Example of Vector ADD
data initialize
allocate GPU memory
upload data to GPU
done....
 download result from GPU
0 + 0 = 1 
-1 + 1 = 0 
-2 + 4 = -815708523 
-3 + 9 = 48 
-4 + 16 = 4197904 
-5 + 25 = 0 
-6 + 36 = 0 
-7 + 49 = 0 
-8 + 64 = 0 
-9 + 81 = 0
[guest13@gpu03 ukletter]$

[guest13@gpu03 ukletter]$ cat 07-simple-vecadd-gpu-correct.cu
#include <stdio.h>
#define N 10

/*
 * CPU reference implementation of the element-wise sum: c[n] = a[n] + b[n]
 * for n = 0 .. N-1. Each array must hold at least N ints.
 */
void vecAdd(int *a, int *b, int *c) {
    int remaining = N;
    int pos = 0;
    while (remaining-- > 0) {
        c[pos] = a[pos] + b[pos];
        pos++;
    }
}

/*
 * Device kernel: the block with index k computes c[k] = a[k] + b[k].
 * Only blockIdx.x selects the element; blocks with index >= N are skipped.
 */
void __global__ vecAdd_gpu(int *a, int *b, int *c){
    const int k = blockIdx.x;
    if (k >= N) {
        return;
    }
    c[k] = a[k] + b[k];
}

/*
 * Host driver for the corrected GPU vector add.
 *
 * Steps: fill a and b on the host, allocate three device buffers, upload a
 * and b, launch one block per element, download c, print every sum.
 *
 * Changes from the recorded listing:
 *  - the grid size was spelled `n` (undeclared); it is the macro N;
 *  - every CUDA call is checked, and a failure is reported on stderr with
 *    exit status 1 instead of printing whatever c happens to contain;
 *  - the device buffers are released before returning.
 * On success the output is unchanged.
 */
int main(void) {
    int a[N], b[N], c[N];
    int *a_dev = NULL, *b_dev = NULL, *c_dev = NULL;
    cudaError_t status;

    printf("Example of Vector ADD\n");
    printf("data initialize\n");
    for(int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i ;
    }

    printf("allocate GPU memory\n");
    status = cudaMalloc((void**)&a_dev, sizeof(int)*N);
    if (status == cudaSuccess) status = cudaMalloc((void**)&b_dev, sizeof(int)*N);
    if (status == cudaSuccess) status = cudaMalloc((void**)&c_dev, sizeof(int)*N);

    printf("upload data to GPU\n");
    if (status == cudaSuccess) status = cudaMemcpy(a_dev, a, sizeof(int)*N, cudaMemcpyHostToDevice);
    if (status == cudaSuccess) status = cudaMemcpy(b_dev, b, sizeof(int)*N, cudaMemcpyHostToDevice);

    if (status == cudaSuccess) {
        /* N blocks of 1 thread each; the kernel indexes elements by blockIdx.x. */
        vecAdd_gpu<<<N,1>>>(a_dev, b_dev, c_dev);
        /* A launch does not return a status; bad configurations surface here. */
        status = cudaGetLastError();
    }
    printf("done....\n");

    printf(" download result from GPU\n");
    if (status == cudaSuccess) {
        /* Blocking copy: also reports faults raised while the kernel ran. */
        status = cudaMemcpy(c, c_dev, sizeof(int)*N, cudaMemcpyDeviceToHost);
    }

    /* cudaFree on a NULL pointer is a no-op, so this is safe after a failed allocation. */
    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }

    for(int i=0; i<N; i++) {
        printf("%d + %d = %d \n", a[i], b[i], c[i]);
    }
    return 0;
}

[guest13@gpu03 ukletter]$ nvcc 07-simple-vecadd-gpu-correct.cu 
[guest13@gpu03 ukletter]$ ./a.out
Example of Vector ADD
data initialize
allocate GPU memory
upload data to GPU
done....
 download result from GPU
0 + 0 = 0 
-1 + 1 = 0 
-2 + 4 = 2 
-3 + 9 = 6 
-4 + 16 = 12 
-5 + 25 = 20 
-6 + 36 = 30 
-7 + 49 = 42 
-8 + 64 = 56 
-9 + 81 = 72
[guest13@gpu03 ukletter]$

08-simple-vecadd-gpu-bt.cu

Line(3): N값을 10에서 100으로 변경하였다.
Line(38~41): x축만 존재하는 1차원 연산이다. 1차원 연산이므로 Thread 사이즈는 GPU Core의 수와 같게 설정한다.(3차원 연산이라면 총 GPU Core의 수 / 3 의 소숫점첫째자리 내림값으로 해줘야 한다.) 여기서는 Thread의 수를 10으로 정했고, 총 연산의 수가 100이므로 Block의 갯수는 총 연산의 수를 Thread의 수로 나눈 10(= 100 / 10)이 된다. 즉, 고정된 Thread의 수에 따라 Block의 갯수가 정해진다.

[guest13@gpu03 ukletter]$ cat 08-simple-vecadd-gpu-bt.cu 
#include <stdio.h>
#define N 100

/*
 * CPU reference implementation: element-wise sum of a and b into c over the
 * first N entries. Each array must hold at least N ints.
 */
void vecAdd(int *a, int *b, int *c) {
    int idx;
    for (idx = 0; idx != N; ++idx) {
        const int sum = a[idx] + b[idx];
        c[idx] = sum;
    }
}

/*
 * Device kernel: one element per thread.
 * The element index is blockIdx.x * blockDim.x + threadIdx.x; threads whose
 * index is N or larger do nothing, so the launch may cover more than N threads.
 */
void __global__ vecAdd_gpu(int *a, int *b, int *c) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x ;
    if (gid >= N) {
        return;
    }
    c[gid] = a[gid] + b[gid];
}

/*
 * Host driver for the block/thread GPU vector add.
 *
 * Steps: fill a and b on the host, allocate three device buffers, upload a
 * and b, launch enough 10-thread blocks to cover N elements, download c,
 * print every sum.
 *
 * Changes from the recorded listing:
 *  - the block count is derived from tsize.x with a round-up division instead
 *    of the literal N/10, so it stays correct if the thread count changes or
 *    N is not a multiple of it (the kernel already skips indices >= N);
 *  - every CUDA call is checked, and a failure is reported on stderr with
 *    exit status 1;
 *  - the device buffers are released before returning.
 * With N = 100 the launch is still 10 blocks of 10 threads and the output is
 * unchanged.
 */
int main(void) {
    int a[N], b[N], c[N];
    int *a_dev = NULL, *b_dev = NULL, *c_dev = NULL;
    cudaError_t status;

    printf("Example of Vector ADD\n");
    printf("data initialize\n");
    for(int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i ;
    }

    printf("allocate GPU memory\n");
    status = cudaMalloc((void**)&a_dev, sizeof(int)*N);
    if (status == cudaSuccess) status = cudaMalloc((void**)&b_dev, sizeof(int)*N);
    if (status == cudaSuccess) status = cudaMalloc((void**)&c_dev, sizeof(int)*N);

    printf("upload data to GPU\n");
    if (status == cudaSuccess) status = cudaMemcpy(a_dev, a, sizeof(int)*N, cudaMemcpyHostToDevice);
    if (status == cudaSuccess) status = cudaMemcpy(b_dev, b, sizeof(int)*N, cudaMemcpyHostToDevice);

    dim3 bsize, tsize;
    tsize.x = 10;                                  /* threads per block */
    bsize.x = (N + tsize.x - 1) / tsize.x;         /* blocks needed to cover N, rounded up */
    if (status == cudaSuccess) {
        vecAdd_gpu<<<bsize,tsize>>>(a_dev, b_dev, c_dev);
        /* A launch does not return a status; bad configurations surface here. */
        status = cudaGetLastError();
    }
    printf("done....\n");

    printf(" download result from GPU\n");
    if (status == cudaSuccess) {
        /* Blocking copy: also reports faults raised while the kernel ran. */
        status = cudaMemcpy(c, c_dev, sizeof(int)*N, cudaMemcpyDeviceToHost);
    }

    /* cudaFree on a NULL pointer is a no-op, so this is safe after a failed allocation. */
    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);

    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(status));
        return 1;
    }

    for(int i=0; i<N; i++) {
        printf("%d + %d = %d \n", a[i], b[i], c[i]);
    }
    return 0;
}

[guest13@gpu03 ukletter]$ nvcc 08-simple-vecadd-gpu-bt.cu 
[guest13@gpu03 ukletter]$ ./a.out
Example of Vector ADD
data initialize
allocate GPU memory
upload data to GPU
done....
 download result from GPU
0 + 0 = 0 
-1 + 1 = 0 
-2 + 4 = 2 
-3 + 9 = 6 
-4 + 16 = 12 
-5 + 25 = 20 
-6 + 36 = 30 
-7 + 49 = 42 

...생략...

-93 + 8649 = 8556 
-94 + 8836 = 8742 
-95 + 9025 = 8930 
-96 + 9216 = 9120 
-97 + 9409 = 9312 
-98 + 9604 = 9506 
-99 + 9801 = 9702
[guest13@gpu03 ukletter]$

저작자표시 비영리 변경금지 (새창열림)

SENS

Programming Note

NVIDIA Korea - GPGPU Programming with CUDA (Part 1)

티스토리툴바