[guest13@notebook13 ~]$ ssh -X gpu
guest13@gpu's password:
Last login: Tue Dec 11 19:56:52 2012 from 192.168.1.37
[guest13@gpu03 ~]$ nvidia-smi
Tue Dec 11 21:45:22 2012
+------------------------------------------------------+
| NVIDIA-SMI 4.304.54 Driver Version: 304.54 |
|-------------------------------+----------------------+----------------------+
| GPU Name | Bus-Id Disp. | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 Tesla S2050 | 0000:02:00.0 Off | Off |
| N/A N/A P0 N/A / N/A | 0% 6MB / 3071MB | 0% Default |
+-------------------------------+----------------------+----------------------+
| 1 Tesla M2050 | 0000:03:00.0 Off | Off |
| N/A N/A P0 N/A / N/A | 0% 6MB / 3071MB | 0% Default |
+-------------------------------+----------------------+----------------------+
+-----------------------------------------------------------------------------+
| Compute processes: GPU Memory |
| GPU PID Process name Usage |
|=============================================================================|
| No running compute processes found |
+-----------------------------------------------------------------------------+
[guest13@gpu03 ~]$
[guest13@gpu03 ~]$ nvcc
-bash: nvcc: command not found
[guest13@gpu03 ~]$ gcc
gcc: no input files
[guest13@gpu03 ~]$ module load cuda-5.0-hryu
[guest13@gpu03 ~]$ nvcc
nvcc fatal : No input files specified; use option --help for more information
[guest13@gpu03 ~]$
엔비디아 컴파일러(nvcc) 모듈 로드
01-simple-nvcc.cu
- Line(18): nvcc 컴파일러는 CUDA 코드가 없는 순수 C 소스라면 확장자가 .c인 파일도 컴파일할 수 있다.
[guest13@gpu03 ~]$ cd ukletter
[guest13@gpu03 ukletter]$ cat 01-simple-nvcc.cu
#include <stdio.h>
int main(int argc, char *argv[]) {
    /* Host-only program: it contains no CUDA constructs and ignores its
       command-line arguments; it just writes one greeting line to stdout. */
    (void)argc;
    (void)argv;
    fputs("Hello, world\n", stdout);
    return 0;
}
[guest13@gpu03 ukletter]$ nvcc 01-simple-nvcc.cu
[guest13@gpu03 ukletter]$ ./a.out
Hello, world
[guest13@gpu03 ukletter]$ gcc 01-simple-nvcc.cu
01-simple-nvcc.cu: file not recognized: File format not recognized
collect2: ld returned 1 exit status
[guest13@gpu03 ukletter]$ cp 01-simple-nvcc.cu 01-simple-nvcc.c
[guest13@gpu03 ukletter]$ gcc 01-simple-nvcc.c
[guest13@gpu03 ukletter]$ nvcc 01-simple-nvcc.c
[guest13@gpu03 ukletter]$ ./a.out
Hello, world
[guest13@gpu03 ukletter]$
02-simple-kernel.cu
- Line(24~30): .c파일 안에 CUDA코드가 들어있어 gcc가 컴파일에 실패한다.
- Line(32~38): nvcc는 확장자가 .c인 파일을 일반 C 소스로 간주하여 호스트 컴파일러(gcc)에 그대로 넘기므로, 파일 안에 CUDA 코드가 들어있으면 gcc와 동일한 에러로 컴파일에 실패한다.(CUDA 코드는 반드시 .cu 파일에 작성해야 한다)
[guest13@gpu03 ukletter]$ cat 02-simple-kernel.cu
#include <stdio.h>
__global__ void kernel(void) {
    /* Intentionally empty: this kernel exists only so that the host code
       has something to launch on the device. */
}
/* Host entry point: prints progress messages, launches the kernel once, and resets the device before exiting. */
int main(void) {
printf("hello first CUDA kernel\n");
/* Launch configuration <<< 6 , 4 >>>: 6 blocks in the grid, 4 threads in each block. */
kernel<<< 6 , 4 >>>();
printf("done....\n");
printf("flush the data from GPU\n");
/* Reset the CUDA device state held by this process (the message above calls this "flushing"). */
cudaDeviceReset();
return 0;
}
[guest13@gpu03 ukletter]$ nvcc 02-simple-kernel.cu
[guest13@gpu03 ukletter]$ ./a.out
hello first CUDA kernel
done....
flush the data from GPU
[guest13@gpu03 ukletter]$ cp 02-simple-kernel.cu 02-simple-kernel.c
[guest13@gpu03 ukletter]$ gcc 02-simple-kernel.c
02-simple-kernel.c:3: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’
before ‘void’
02-simple-kernel.c: In function ‘main’:
02-simple-kernel.c:9: error: ‘kernel’ undeclared (first use in this function)
02-simple-kernel.c:9: error: (Each undeclared identifier is reported only once
02-simple-kernel.c:9: error: for each function it appears in.)
02-simple-kernel.c:9: error: expected expression before ‘<’ token
[guest13@gpu03 ukletter]$ nvcc 02-simple-kernel.c
02-simple-kernel.c:3: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’
before ‘void’
02-simple-kernel.c: In function ‘main’:
02-simple-kernel.c:9: error: ‘kernel’ undeclared (first use in this function)
02-simple-kernel.c:9: error: (Each undeclared identifier is reported only once
02-simple-kernel.c:9: error: for each function it appears in.)
02-simple-kernel.c:9: error: expected expression before ‘<’ token
[guest13@gpu03 ukletter]$
03-simple-kernelprint.cu
- Line(19~20): Line(5)의 printf()함수를 GPU 내부에서 처리하지 못하여서 에러발생(옛날에는 별도의 에뮬레이터가 필요)
- Line(21): nvcc에 -arch=sm_20 옵션부여(Compute Capability 2.0을 대상으로 컴파일하는 옵션으로, 커널 내부의 printf()는 Compute Capability 2.0 이상에서만 지원된다)
- Line(26~49): GPU에서 병렬적으로 처리되는 것을 보여준다. Block Index의 출력 순서는 무작위지만, 같은 Block 내에서 Thread Index는 어느 정도 순차적으로 나타난다.
[guest13@gpu03 ukletter]$ cat 03-simple-kernelprint.cu
#include <stdio.h>
/* Kernel: every launched thread prints its own block index and thread index. */
__global__ void kernel(void) {
/* Device-side printf: in the recorded session a plain `nvcc` build rejects this call, while `nvcc -arch=sm_20` accepts it. */
printf("hello from <<< %d , %d >>> \n", blockIdx.x, threadIdx.x);
return;
}
/* Host entry point: launches the printing kernel with 6 blocks of 4 threads, then resets the device. */
int main(void) {
printf("hello first CUDA kernel\n");
/* 6 blocks x 4 threads = 24 kernel threads, each printing one line. */
kernel<<< 6 , 4 >>>();
/* In the recorded run the host messages below appear before any kernel output. */
printf("done....\n");
printf("flush the data from GPU\n");
/* In the recorded run the kernel's printf lines show up only after this call. */
cudaDeviceReset();
return 0;
}
[guest13@gpu03 ukletter]$ nvcc 03-simple-kernelprint.cu
03-simple-kernelprint.cu(4): error: calling a __host__ function("printf") from a __global__ function("kernel") is not allowed
1 error detected in the compilation of "/tmp/tmpxft_0000139d_00000000-6_03-simple-kernelprint.cpp1.ii".
[guest13@gpu03 ukletter]$ nvcc -arch=sm_20 03-simple-kernelprint.cu
[guest13@gpu03 ukletter]$ ./a.out
hello first CUDA kernel
done....
flush the data from GPU
hello from <<< 1 , 0 >>>
hello from <<< 1 , 1 >>>
hello from <<< 1 , 2 >>>
hello from <<< 1 , 3 >>>
hello from <<< 5 , 0 >>>
hello from <<< 5 , 1 >>>
hello from <<< 5 , 2 >>>
hello from <<< 5 , 3 >>>
hello from <<< 4 , 0 >>>
hello from <<< 4 , 1 >>>
hello from <<< 4 , 2 >>>
hello from <<< 4 , 3 >>>
hello from <<< 0 , 0 >>>
hello from <<< 0 , 1 >>>
hello from <<< 0 , 2 >>>
hello from <<< 0 , 3 >>>
hello from <<< 3 , 0 >>>
hello from <<< 3 , 1 >>>
hello from <<< 3 , 2 >>>
hello from <<< 3 , 3 >>>
hello from <<< 2 , 0 >>>
hello from <<< 2 , 1 >>>
hello from <<< 2 , 2 >>>
hello from <<< 2 , 3 >>>
[guest13@gpu03 ukletter]$ vi 03-simple-kernelprint.cu
#include <stdio.h>
/* Kernel: every launched thread prints its own block index and thread index. */
__global__ void kernel(void) {
/* Device-side printf; this version of the file is built with `nvcc -arch=sm_20` in the session. */
printf("hello from <<< %d , %d >>> \n", blockIdx.x, threadIdx.x);
return;
}
/* Host entry point: same program as before, with the launch enlarged to 1000 blocks of 1000 threads. */
int main(void) {
printf("hello first CUDA kernel\n");
/* 1000 blocks in the grid, 1000 threads in each block. */
kernel<<< 1000 , 1000 >>>();
printf("done....\n");
printf("flush the data from GPU\n");
/* Reset the CUDA device state held by this process before exiting. */
cudaDeviceReset();
return 0;
}
:wq
[guest13@gpu03 ukletter]$ nvcc -arch=sm_20 03-simple-kernelprint.cu
[guest13@gpu03 ukletter]$ ./a.out
...생략...
hello from <<< 998 , 24 >>>
hello from <<< 998 , 25 >>>
hello from <<< 998 , 26 >>>
hello from <<< 998 , 27 >>>
hello from <<< 998 , 28 >>>
hello from <<< 998 , 29 >>>
hello from <<< 998 , 30 >>>
hello from <<< 998 , 31 >>>
[guest13@gpu03 ukletter]$ vi 03-simple-kernelprint.cu
#include <stdio.h>
/* Kernel: every launched thread prints its own block index and thread index. */
__global__ void kernel(void) {
/* Device-side printf; this version of the file is built with `nvcc -arch=sm_20` in the session. */
printf("hello from <<< %d , %d >>> \n", blockIdx.x, threadIdx.x);
return;
}
/* Host entry point: same program as before, with the launch set to 14 blocks of 32 threads. */
int main(void) {
printf("hello first CUDA kernel\n");
/* 14 blocks per grid, 32 threads per block.
   NOTE(review): the original note equates 32 with the "number of GPU Cores" - presumably
   the cores of one multiprocessor rather than of the whole GPU; confirm for this device. */
kernel<<< 14 , 32 >>>();
printf("done....\n");
printf("flush the data from GPU\n");
/* Reset the CUDA device state held by this process before exiting. */
cudaDeviceReset();
return 0;
}
:wq
[guest13@gpu03 ukletter]$ nvcc -arch=sm_20 03-simple-kernelprint.cu
[guest13@gpu03 ukletter]$ ./a.out
...생략...
hello from <<< 2 , 24 >>>
hello from <<< 2 , 25 >>>
hello from <<< 2 , 26 >>>
hello from <<< 2 , 27 >>>
hello from <<< 2 , 28 >>>
hello from <<< 2 , 29 >>>
hello from <<< 2 , 30 >>>
hello from <<< 2 , 31 >>>
[guest13@gpu03 ukletter]$
04-simple-add.cu
- Line(15): Block 갯수와 Thread 갯수는 그냥 1로 설정하였다. 이 예제는 GPU에서 연산처리되는 것만 확인한다.
[guest13@gpu03 ukletter]$ cat 04-simple-add.cu
#include <stdio.h>
__global__ void add(int a, int b, int *c) {
    /* Store the sum of the two by-value operands through the output
       pointer; every thread that runs this kernel writes the same slot. */
    c[0] = a + b;
}
int main(void) {
    /* Adds 3 and 7 on the GPU with a single thread and prints the result. */
    int sum_on_host;        /* receives the value copied back from the device */
    int *sum_on_device;     /* one-int buffer allocated with cudaMalloc */
    const size_t nbytes = sizeof *sum_on_device;

    printf("Example of add\n");
    cudaMalloc((void **)&sum_on_device, nbytes);

    /* One block containing one thread is enough for a single addition. */
    add<<<1, 1>>>(3, 7, sum_on_device);
    printf("done....\n");

    printf("download the result from GPU\n");
    cudaMemcpy(&sum_on_host, sum_on_device, nbytes, cudaMemcpyDeviceToHost);
    cudaFree(sum_on_device);

    printf("3 + 7 = %d\n", sum_on_host);
    return 0;
}
[guest13@gpu03 ukletter]$ nvcc 04-simple-add.cu
[guest13@gpu03 ukletter]$ ./a.out
Example of add
done....
download the result from GPU
3 + 7 = 10
[guest13@gpu03 ukletter]$
05-simple-vecadd-cpu.cu
- Line(33~40): for loop 안에 int i; 선언 부분때문에 에러가 발생했다.
- Line(41): gcc에 -std=c99 옵션을 부여하여 정상적으로 컴파일을 완료하였다.
[guest13@gpu03 ukletter]$ cat 05-simple-vecadd-cpu.cu
#include <stdio.h>
#define N 10
/* CPU vector add: writes c[i] = a[i] + b[i] for i in [0, N). */
void add(int *a, int *b, int *c) {
/* The loop variable is declared inside the for statement; the recorded default gcc build
   rejects this form and the `gcc -std=c99` build accepts it. */
for(int i=0; i<N; i++) {
c[i] = a[i] + b[i];
}
return;
}
/* Host-only driver: fills a and b, adds them on the CPU with add(), and prints every element. */
int main(void) {
int a[N], b[N], c[N];
printf("Example of Vector ADD\n");
printf("data initialize\n");
/* Inputs: a[i] = -i and b[i] = i squared. */
for(int i=0; i<N; i++) {
a[i] = -i;
b[i] = i * i ;
}
add(a, b, c);
printf("done....\n");
/* A second for-scope declaration of i; the recorded default gcc build reports it as a redefinition. */
for(int i=0; i<N; i++) {
printf("%d + %d = %d \n", a[i], b[i], c[i]);
}
return 0;
}
[guest13@gpu03 ukletter]$ cp 05-simple-vecadd-cpu.cu 05-simple-vecadd-cpu.c
[guest13@gpu03 ukletter]$ gcc 05-simple-vecadd-cpu.c
05-simple-vecadd-cpu.c: In function ‘add’:
05-simple-vecadd-cpu.c:6: error: ‘for’ loop initial declarations are only allowed in C99 mode
05-simple-vecadd-cpu.c:6: note: use option -std=c99 or -std=gnu99 to compile your code
05-simple-vecadd-cpu.c: In function ‘main’:
05-simple-vecadd-cpu.c:18: error: ‘for’ loop initial declarations are only allowed in C99 mode
05-simple-vecadd-cpu.c:26: error: redefinition of ‘i’
05-simple-vecadd-cpu.c:18: note: previous definition of ‘i’ was here
05-simple-vecadd-cpu.c:26: error: ‘for’ loop initial declarations are only allowed in C99 mode
[guest13@gpu03 ukletter]$ gcc -std=c99 05-simple-vecadd-cpu.c
[guest13@gpu03 ukletter]$ ./a.out
Example of Vector ADD
data initialize
done....
0 + 0 = 0
-1 + 1 = 0
-2 + 4 = 2
-3 + 9 = 6
-4 + 16 = 12
-5 + 25 = 20
-6 + 36 = 30
-7 + 49 = 42
-8 + 64 = 56
-9 + 81 = 72
[guest13@gpu03 ukletter]$
06-simple-vecadd-gpu.cu & 07-simple-vecadd-gpu-correct.cu
- Line(38): 커널의 Parameter로는 Host 배열(a, b, c)이 아니라 cudaMalloc()으로 GPU에 동적할당된 포인터(a_dev, b_dev, c_dev)를 넘겨야 한다.
- Line(42): GPU에서 연산이 끝난 후, Device에서 Host쪽으로 메모리카피가 이루어져야 한다.
- Line(57~66): 각각 비정상결과출력과 정상결과출력의 모습이다.
[guest13@gpu03 ukletter]$ cat 06-simple-vecadd-gpu.cu
#include <stdio.h>
#define N 10
void vecAdd(int *a, int *b, int *c) {
    /* CPU reference: element-wise sum of the first N entries of a and b,
       stored into c. */
    int k = 0;
    while (k < N) {
        c[k] = a[k] + b[k];
        ++k;
    }
}
__global__ void vecAdd_gpu(int *a, int *b, int *c) {
    /* The element handled is selected by the block index alone;
       indices at or beyond N do nothing. */
    const int elem = blockIdx.x;
    if (elem >= N) {
        return;
    }
    c[elem] = a[elem] + b[elem];
}
/*
 * Deliberately faulty GPU vector add; the corrected program is
 * 07-simple-vecadd-gpu-correct.cu. Two mistakes are kept on purpose so that
 * the wrong output recorded for this example can be reproduced:
 *   1. the kernel is given the host arrays a, b, c instead of the device
 *      buffers a_dev, b_dev, c_dev;
 *   2. the copy-back passes cudaMemcpyHostToDevice instead of
 *      cudaMemcpyDeviceToHost.
 * The grid size is the macro N (the listing previously used an undeclared
 * lowercase `n`, which cannot compile).
 */
int main(void) {
    int a[N], b[N], c[N];
    int *a_dev, *b_dev, *c_dev;

    printf("Example of Vector ADD\n");
    printf("data initialize\n");
    for(int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i;
    }

    printf("allocate GPU memory\n");
    cudaMalloc((void**)&a_dev, sizeof(int)*N);
    cudaMalloc((void**)&b_dev, sizeof(int)*N);
    cudaMalloc((void**)&c_dev, sizeof(int)*N);

    printf("upload data to GPU\n");
    cudaMemcpy(a_dev, a, sizeof(int)*N, cudaMemcpyHostToDevice);
    cudaMemcpy(b_dev, b, sizeof(int)*N, cudaMemcpyHostToDevice);

    /* N blocks of one thread each. DELIBERATE MISTAKE 1: host arrays are
       passed where the device buffers belong. */
    vecAdd_gpu<<<N,1>>>(a, b, c);
    printf("done....\n");

    printf(" download result from GPU\n");
    /* DELIBERATE MISTAKE 2: the direction flag does not match a
       device-to-host copy. */
    cudaMemcpy(c, c_dev, sizeof(int)*N, cudaMemcpyHostToDevice);

    /* The recorded run prints garbage in the third column here. */
    for(int i=0; i<N; i++) {
        printf("%d + %d = %d \n", a[i], b[i], c[i]);
    }
    return 0;
}
[guest13@gpu03 ukletter]$ nvcc 06-simple-vecadd-gpu.cu
[guest13@gpu03 ukletter]$ ./a.out
Example of Vector ADD
data initialize
allocate GPU memory
upload data to GPU
done....
download result from GPU
0 + 0 = 1
-1 + 1 = 0
-2 + 4 = -815708523
-3 + 9 = 48
-4 + 16 = 4197904
-5 + 25 = 0
-6 + 36 = 0
-7 + 49 = 0
-8 + 64 = 0
-9 + 81 = 0
[guest13@gpu03 ukletter]$
[guest13@gpu03 ukletter]$ cat 07-simple-vecadd-gpu-correct.cu
#include <stdio.h>
#define N 10
void vecAdd(int *a, int *b, int *c) {
    /* Host-side element-wise addition over the first N entries. */
    for (int pos = 0; pos < N; ++pos) {
        const int lhs = a[pos];
        const int rhs = b[pos];
        c[pos] = lhs + rhs;
    }
}
__global__ void vecAdd_gpu(int *a, int *b, int *c) {
    /* The block index picks the element; the bounds test keeps any extra
       blocks from touching memory past the N-th entry. */
    const int slot = blockIdx.x;
    if (slot < N) {
        c[slot] = a[slot] + b[slot];
    }
}
/*
 * Corrected GPU vector add: uploads a and b, launches one block per element
 * on the device buffers, copies the result back with
 * cudaMemcpyDeviceToHost, prints every element, and frees the device
 * buffers.
 */
int main(void) {
    int a[N], b[N], c[N];
    int *a_dev, *b_dev, *c_dev;

    printf("Example of Vector ADD\n");
    printf("data initialize\n");
    for(int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i ;
    }

    printf("allocate GPU memory\n");
    cudaMalloc((void**)&a_dev, sizeof(int)*N);
    cudaMalloc((void**)&b_dev, sizeof(int)*N);
    cudaMalloc((void**)&c_dev, sizeof(int)*N);

    printf("upload data to GPU\n");
    cudaMemcpy(a_dev, a, sizeof(int)*N, cudaMemcpyHostToDevice);
    cudaMemcpy(b_dev, b, sizeof(int)*N, cudaMemcpyHostToDevice);

    /* N blocks of one thread each; the kernel indexes by blockIdx.x.
       The grid size must be the macro N (a lowercase `n` is undeclared). */
    vecAdd_gpu<<<N,1>>>(a_dev, b_dev, c_dev);
    printf("done....\n");

    printf(" download result from GPU\n");
    /* Device -> host copy of the result vector. */
    cudaMemcpy(c, c_dev, sizeof(int)*N, cudaMemcpyDeviceToHost);

    for(int i=0; i<N; i++) {
        printf("%d + %d = %d \n", a[i], b[i], c[i]);
    }

    /* Release the device buffers instead of leaking them until exit. */
    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);
    return 0;
}
[guest13@gpu03 ukletter]$ nvcc 07-simple-vecadd-gpu-correct.cu
[guest13@gpu03 ukletter]$ ./a.out
Example of Vector ADD
data initialize
allocate GPU memory
upload data to GPU
done....
download result from GPU
0 + 0 = 0
-1 + 1 = 0
-2 + 4 = 2
-3 + 9 = 6
-4 + 16 = 12
-5 + 25 = 20
-6 + 36 = 30
-7 + 49 = 42
-8 + 64 = 56
-9 + 81 = 72
[guest13@gpu03 ukletter]$
08-simple-vecadd-gpu-bt.cu
- Line(3): N값을 10에서 100으로 변경하였다.
- Line(38~41): x축만 존재하는 1차원 연산이다. 1차원 연산이므로 Thread 사이즈는 GPU Core의 수와 같게 설정한다.(3차원 연산이라면 총 GPU Core의 수 / 3 의 소수점첫째자리 내림값으로 해줘야 한다.) 여기서는 Thread의 수를 10으로 정했고, 총 연산의 수가 100이므로 Block의 갯수는 총 연산의 수를 Thread의 수로 나눈 10(= 100 / 10)이 된다. 즉, Fixed 된 Thread의 수에 따라 Block의 갯수가 정해진다.
[guest13@gpu03 ukletter]$ cat 08-simple-vecadd-gpu-bt.cu
#include <stdio.h>
#define N 100
void vecAdd(int *a, int *b, int *c) {
    /* CPU reference implementation: c = a + b over N elements. */
    int idx = 0;
    while (idx < N) {
        c[idx] = a[idx] + b[idx];
        idx++;
    }
}
__global__ void vecAdd_gpu(int *a, int *b, int *c) {
    /* Flat 1-D global index: block offset plus the thread's position
       inside its block. Threads whose index falls at or past N exit
       without touching memory. */
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    if (gid >= N) {
        return;
    }
    c[gid] = a[gid] + b[gid];
}
/*
 * GPU vector add using both blocks and threads: uploads a and b, launches
 * enough 10-thread blocks to cover all N elements, copies the result back,
 * prints every element, and frees the device buffers.
 */
int main(void) {
    int a[N], b[N], c[N];
    int *a_dev, *b_dev, *c_dev;

    printf("Example of Vector ADD\n");
    printf("data initialize\n");
    for(int i=0; i<N; i++) {
        a[i] = -i;
        b[i] = i * i ;
    }

    printf("allocate GPU memory\n");
    cudaMalloc((void**)&a_dev, sizeof(int)*N);
    cudaMalloc((void**)&b_dev, sizeof(int)*N);
    cudaMalloc((void**)&c_dev, sizeof(int)*N);

    printf("upload data to GPU\n");
    cudaMemcpy(a_dev, a, sizeof(int)*N, cudaMemcpyHostToDevice);
    cudaMemcpy(b_dev, b, sizeof(int)*N, cudaMemcpyHostToDevice);

    dim3 bsize, tsize;
    tsize.x = 10;
    /* Ceil-division so a trailing partial block is still launched when N is
       not a multiple of the block size; the kernel's i<N test discards the
       surplus threads. For N = 100 this is still 10 blocks. */
    bsize.x = (N + tsize.x - 1) / tsize.x;
    vecAdd_gpu<<<bsize,tsize>>>(a_dev, b_dev, c_dev);
    printf("done....\n");

    printf(" download result from GPU\n");
    /* Device -> host copy of the result vector. */
    cudaMemcpy(c, c_dev, sizeof(int)*N, cudaMemcpyDeviceToHost);

    for(int i=0; i<N; i++) {
        printf("%d + %d = %d \n", a[i], b[i], c[i]);
    }

    /* Release the device buffers instead of leaking them until exit. */
    cudaFree(a_dev);
    cudaFree(b_dev);
    cudaFree(c_dev);
    return 0;
}
[guest13@gpu03 ukletter]$ nvcc 08-simple-vecadd-gpu-bt.cu
[guest13@gpu03 ukletter]$ ./a.out
Example of Vector ADD
data initialize
allocate GPU memory
upload data to GPU
done....
download result from GPU
0 + 0 = 0
-1 + 1 = 0
-2 + 4 = 2
-3 + 9 = 6
-4 + 16 = 12
-5 + 25 = 20
-6 + 36 = 30
-7 + 49 = 42
...생략...
-93 + 8649 = 8556
-94 + 8836 = 8742
-95 + 9025 = 8930
-96 + 9216 = 9120
-97 + 9409 = 9312
-98 + 9604 = 9506
-99 + 9801 = 9702
[guest13@gpu03 ukletter]$