image_proc_samples/resize.cpp at main · daviddrell/image_proc_samples · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#include "resize.h"
#include "stdio.h"
#include <math.h>
#include <malloc.h>
#include <limits.h>
#include <float.h>
#include "memory.h"
#include "npp.h"


// This function is a cuda program compiled with the nvidia cuda compiler.
// Once built into a library, it can be called from a c program or any program that can call a c library.
// The calling program does not need to know anything about cuda programming.

int resize_Cuda(float* src, float* dst, __int32 src_width, __int32 src_height, __int32 dst_width, __int32 dst_height)
{
    float* dev_src = 0;
    float* dev_dst = 0;
    cudaError_t cudaStatus;

    // Allocate GPU buffers for the two images (one input, one output)
    size_t memsizeSrc = (size_t)src_width * (size_t)src_height * (size_t)sizeof(float);
    cudaStatus = cudaMalloc((void**)&dev_src, memsizeSrc);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed src!");
        goto Error;
    }

    size_t memsizeDst = (size_t)dst_width * (size_t)dst_height * (size_t)sizeof(float);
    cudaStatus = cudaMalloc((void**)&dev_dst, memsizeDst);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed dst!");
        goto Error;
    }

    // set all destination values to zero
    cudaStatus = cudaMemset((void*)dev_dst, 0, memsizeDst);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemset failed!");
        goto Error;
    }

    // Copy input image from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_src, src, memsizeSrc, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }


    NppiRect srcROI;
    srcROI.x = 0;
    srcROI.y = 0;
    srcROI.width = src_width;
    srcROI.height = src_height;
    NppiSize srcSize;
    srcSize.width = src_width;
    srcSize.height = src_height;
    int srcStep = src_width * sizeof(float);

    NppiRect dstROI;
    dstROI.x = 0;
    dstROI.y = 0;
    dstROI.width = dst_width;
    dstROI.height = dst_height;
    NppiSize dstSize;
    dstSize.width = dst_width;
    dstSize.height = dst_height;
    int dstStep = dst_width * sizeof(float);

    int eInterpolation = NPPI_INTER_CUBIC;
    NppStreamContext nppStreamContext{};

    NppStatus status = nppiResize_32f_C1R_Ctx(dev_src, srcStep, srcSize,srcROI, dev_dst, dstStep, dstSize, dstROI, eInterpolation, nppStreamContext);

    fprintf(stderr, "NppStatus status = %d\n", status);

    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching Kernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.

    cudaStatus = cudaMemcpy(dst, dev_dst, memsizeDst, cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    cudaFree(dev_src);
    cudaFree(dev_dst);

    return (int)cudaStatus;
}