This is my first not-so-deep dive into the NDK.
I wanted to rewrite this code with the NDK for performance reasons. My C
file looks like this:
#include <jni.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>
#include <android/log.h>
/**
 * Rotates an NV21 frame (full-resolution Y plane followed by an interleaved,
 * 2x2-subsampled chroma plane) from `data` into `output`.
 *
 * The mapping is derived from `rotation` exactly as follows:
 *   - rows/columns are swapped when `rotation % 180 != 0`;
 *   - the x axis is mirrored when `rotation % 270 != 0`;
 *   - the y axis is mirrored when `rotation >= 180`.
 *
 * @param env      JNI environment.
 * @param thiz     Unused (class reference of the static native method).
 * @param data     Source frame; only read, never written back.
 * @param output   Destination frame; must be large enough for the rotated frame.
 * @param width    Source width in pixels, must be > 0.
 * @param height   Source height in pixels, must be > 0.
 * @param rotation Rotation in degrees (see mapping above).
 * @return `output` on success; NULL if the arguments are invalid, an array is
 *         too small for the requested dimensions, or the array elements could
 *         not be obtained (in which case a Java exception may be pending).
 *
 * The elapsed time is logged in milliseconds. Note that clock() reports
 * processor time used by the process, not wall-clock time, so the figure is
 * not directly comparable with a System.nanoTime() measurement on the Java side.
 */
JNIEXPORT jbyteArray JNICALL
Java_com_company_app_tools_NV21FrameRotator_rotateNV21(JNIEnv *env, jclass thiz,
                                                       jbyteArray data, jbyteArray output,
                                                       jint width, jint height, jint rotation) {
    (void) thiz;
    const clock_t start = clock();

    /* Negative sizes would turn into huge unsigned loop bounds below. */
    if (data == NULL || output == NULL || width <= 0 || height <= 0) {
        __android_log_write(ANDROID_LOG_ERROR, "NV21FrameRotator",
                            "rotateNV21: invalid arguments");
        return NULL;
    }

    const unsigned int w = (unsigned int) width;
    const unsigned int h = (unsigned int) height;

    const bool swap = rotation % 180 != 0;
    const bool xflip = rotation % 270 != 0;
    const bool yflip = rotation >= 180;

    /* Output geometry does not depend on the pixel, so compute it once. */
    const unsigned int wOut = swap ? h : w;
    const unsigned int hOut = swap ? w : h;

    /*
     * Smallest array lengths that keep every index touched by the loop in
     * range: the luma plane, plus the offset of the last chroma pair, plus the
     * two bytes of that pair. Done in 64 bits so the products cannot overflow.
     */
    const long long lumaSize = (long long) w * h;
    const long long needIn =
            lumaSize + (long long) ((h - 1u) >> 1u) * w + ((w - 1u) & ~1u) + 2;
    const long long needOut =
            lumaSize + (long long) ((hOut - 1u) >> 1u) * wOut + ((wOut - 1u) & ~1u) + 2;
    if ((*env)->GetArrayLength(env, data) < needIn ||
        (*env)->GetArrayLength(env, output) < needOut) {
        __android_log_write(ANDROID_LOG_ERROR, "NV21FrameRotator",
                            "rotateNV21: array too small for frame dimensions");
        return NULL;
    }

    jbyte *dataPtr = (*env)->GetByteArrayElements(env, data, NULL);
    if (dataPtr == NULL) {
        return NULL;
    }
    jbyte *outputPtr = (*env)->GetByteArrayElements(env, output, NULL);
    if (outputPtr == NULL) {
        (*env)->ReleaseByteArrayElements(env, data, dataPtr, JNI_ABORT);
        return NULL;
    }

    /* Fits in unsigned int: it was just checked against a Java array length. */
    const unsigned int frameSize = (unsigned int) lumaSize;

    for (unsigned int j = 0; j < h; j++) {
        for (unsigned int i = 0; i < w; i++) {
            const unsigned int yIn = j * w + i;
            const unsigned int uIn = frameSize + (j >> 1u) * w + (i & ~1u);
            const unsigned int vIn = uIn + 1;

            const unsigned int iSwapped = swap ? j : i;
            const unsigned int jSwapped = swap ? i : j;
            const unsigned int iOut = xflip ? wOut - iSwapped - 1 : iSwapped;
            const unsigned int jOut = yflip ? hOut - jSwapped - 1 : jSwapped;

            const unsigned int yOut = jOut * wOut + iOut;
            const unsigned int uOut = frameSize + (jOut >> 1u) * wOut + (iOut & ~1u);
            const unsigned int vOut = uOut + 1;

            outputPtr[yOut] = dataPtr[yIn];
            outputPtr[uOut] = dataPtr[uIn];
            outputPtr[vOut] = dataPtr[vIn];
        }
    }

    /* The input was only read: JNI_ABORT frees the buffer without copying back. */
    (*env)->ReleaseByteArrayElements(env, data, dataPtr, JNI_ABORT);
    /* Mode 0 copies the result back (if the VM handed out a copy) and frees it. */
    (*env)->ReleaseByteArrayElements(env, output, outputPtr, 0);

    const clock_t end = clock();
    const double cpu_time_used = ((double) (end - start)) / CLOCKS_PER_SEC;

    /* Bounded formatting: "%f" needs 11 bytes as soon as the value reaches 100. */
    char str[32];
    snprintf(str, sizeof str, "%f", cpu_time_used * 1000);
    __android_log_write(ANDROID_LOG_ERROR, "NV21FrameRotator", str);
    return output;
}
Both snippets (the linked Java one and the one above) work well, but when I measure the processing duration on the same device, the Java version takes about 7 ms and the C version 12-13 ms (as reported by the Java-side Log.i
timing log)... Shouldn't the native version be faster? Where is the catch?
// Timestamp, in microseconds, taken just before the rotation call.
long micros = System.nanoTime() / 1000;
// Java implementation, ~7 ms:
//data = rotateNV21(inputData, width, height, rotateCameraDegrees);
// Native (C) implementation, ~12-13 ms:
NV21FrameRotator.rotateNV21(inputData, data, width, height, rotateCameraDegrees);
// Elapsed time of the call above, in microseconds.
long elapsedMicros = System.nanoTime() / 1000 - micros;
Log.d(TAG, "Last frame processing duration: " + elapsedMicros + "µs");
PS. The Java log sometimes shows a shorter duration than the native clock()
measurement in the C
file... Sample log:
NV21FrameRotator: 7.942000
NV21RotatorJava: Last frame processing duration: 7403µs
NV21FrameRotator: 7.229000
NV21RotatorJava: Last frame processing duration: 7166µs
NV21FrameRotator: 16.918000
NV21RotatorJava: Last frame processing duration: 20644µs
NV21FrameRotator: 19.594000
NV21RotatorJava: Last frame processing duration: 20479µs
NV21FrameRotator: 9.484000
NV21RotatorJava: Last frame processing duration: 7274µs
from Code translated to C/NDK/JNI less efficient than Java original
No comments:
Post a Comment