36 #include "NE10_math.h"
/*
 * Function-pointer tables shared by the test cases in this file, one table
 * per call signature. The visible tests zero the table(s) they use with
 * memset and then store the C implementation in an even slot and the matching
 * NEON implementation in the following odd slot.
 */
40 ne10_func_2args_t ftbl_2args[MAX_FUNC_COUNT];
41 ne10_func_3args_t ftbl_3args[MAX_FUNC_COUNT];
42 ne10_func_4args_t ftbl_4args[MAX_FUNC_COUNT];
43 ne10_func_5args_t ftbl_5args[MAX_FUNC_COUNT];
44 ne10_func_3args_cst_t ftbl_3args_cst[MAX_FUNC_COUNT];
45 ne10_func_4args_cst_t ftbl_4args_cst[MAX_FUNC_COUNT];
46 ne10_func_5args_cst_t ftbl_5args_cst[MAX_FUNC_COUNT];
/*
 * Buffers for the smoke/regression sections. The the* pointers are what the
 * tests hand to the routines under test; the guarded_* pointers are what the
 * tests hand to free().
 * NOTE(review): presumably guarded_* is the base of an allocation that
 * carries guard padding around the* -- confirm against NE10_SRC_ALLOC_LIMIT
 * and NE10_DST_ALLOC.
 * NOTE(review): the second operand of the condition below evaluates the
 * value of REGRESSION_TEST instead of using defined(); confirm that is
 * intended. The matching #endif lines are not visible in this excerpt.
 */
49 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
50 static ne10_float32_t * guarded_acc = NULL;
51 static ne10_float32_t * guarded_src1 = NULL;
52 static ne10_float32_t * guarded_src2 = NULL;
53 static ne10_float32_t * guarded_cst = NULL;
54 static ne10_float32_t * theacc = NULL;
55 static ne10_float32_t * thesrc1 = NULL;
56 static ne10_float32_t * thesrc2 = NULL;
57 static ne10_float32_t * thecst = NULL;
59 static ne10_float32_t * guarded_dst_c = NULL;
60 static ne10_float32_t * guarded_dst_neon = NULL;
61 static ne10_float32_t * thedst_c = NULL;
62 static ne10_float32_t * thedst_neon = NULL;
/*
 * Buffers and timing results for the performance sections; same naming
 * scheme as above with a perftest_ prefix.
 */
65 #ifdef PERFORMANCE_TEST
66 static ne10_float32_t * perftest_guarded_acc = NULL;
67 static ne10_float32_t * perftest_guarded_src1 = NULL;
68 static ne10_float32_t * perftest_guarded_src2 = NULL;
69 static ne10_float32_t * perftest_guarded_cst = NULL;
70 static ne10_float32_t * perftest_theacc = NULL;
71 static ne10_float32_t * perftest_thesrc1 = NULL;
72 static ne10_float32_t * perftest_thesrc2 = NULL;
73 static ne10_float32_t * perftest_thecst = NULL;
75 static ne10_float32_t * perftest_thedst_c = NULL;
76 static ne10_float32_t * perftest_guarded_dst_c = NULL;
77 static ne10_float32_t * perftest_guarded_dst_neon = NULL;
78 static ne10_float32_t * perftest_thedst_neon = NULL;
/* Total number of floats allocated for each performance buffer. */
79 static ne10_uint32_t perftest_length = 0;
/* time_speedup is time_c / time_neon; time_savings is the percentage of time_c saved. */
81 static ne10_int64_t time_c = 0;
82 static ne10_int64_t time_neon = 0;
83 static ne10_float32_t time_speedup = 0.0f;
84 static ne10_float32_t time_savings = 0.0f;
/*
 * Body of the abs test: compares the C and NEON variants of ne10_abs_* for
 * float, vec2f, vec3f and vec4f inputs, then (under PERFORMANCE_TEST) reports
 * timing for each pair.
 * NOTE(review): the function header is not visible in this excerpt
 * (presumably test_abs_case0), and braces, #endif lines, the declarations of
 * loop/pos/i and the timing code also appear to be missing here.
 */
89 #define MAX_VEC_COMPONENTS 4
91 ne10_int32_t func_loop;
/* Register the variants: slot 2*k holds the C routine for (k+1)-component data, slot 2*k+1 the NEON one. */
94 memset (ftbl_3args, 0,
sizeof (ftbl_3args));
95 ftbl_3args[ 0] = (ne10_func_3args_t) ne10_abs_float_c;
96 ftbl_3args[ 1] = (ne10_func_3args_t) ne10_abs_float_neon;
97 ftbl_3args[ 2] = (ne10_func_3args_t) ne10_abs_vec2f_c;
98 ftbl_3args[ 3] = (ne10_func_3args_t) ne10_abs_vec2f_neon;
99 ftbl_3args[ 4] = (ne10_func_3args_t) ne10_abs_vec3f_c;
100 ftbl_3args[ 5] = (ne10_func_3args_t) ne10_abs_vec3f_neon;
101 ftbl_3args[ 6] = (ne10_func_3args_t) ne10_abs_vec4f_c;
102 ftbl_3args[ 7] = (ne10_func_3args_t) ne10_abs_vec4f_neon;
104 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Correctness pass. */
106 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
107 ne10_int32_t vec_size;
/* Buffers are sized for the widest vector type at the largest element count. */
109 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
112 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
115 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
116 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* For every variant pair and every element count in [0, TEST_ITERATION). */
118 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
120 for (loop = 0; loop < TEST_ITERATION; loop++)
122 vec_size = func_loop + 1;
/* NOTE(review): presumably plants sentinels after the first loop * vec_size floats -- confirm in the macro definition. */
124 GUARD_ARRAY (thedst_c, loop * vec_size);
125 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Run the C routine, then the NEON routine, on the same source with the same count. */
127 ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
128 ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
130 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
131 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* Compare the two outputs vector by vector; the prints dump the inputs as value and raw bits. */
133 for (pos = 0; pos < loop; pos++)
137 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
138 for (i = 0; i < vec_size; i++)
140 fprintf (stdout,
"thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
143 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
148 free (guarded_dst_c);
149 free (guarded_dst_neon);
/* Performance pass: prints a table header, then one row per variant pair. */
152 #ifdef PERFORMANCE_TEST
153 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
154 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
156 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
159 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
160 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
162 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
/* Each implementation is called once per count in [0, PERF_TEST_ITERATION). */
/* NOTE(review): time_c and time_neon are not assigned in the visible lines; presumably set by timing code around these loops. */
165 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
168 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
170 time_speedup = (ne10_float32_t) time_c / time_neon;
171 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
172 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
175 free (perftest_guarded_src1);
176 free (perftest_guarded_dst_c);
177 free (perftest_guarded_dst_neon);
180 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
181 #undef MAX_VEC_COMPONENTS
/*
 * test_addc_case0: compares the C and NEON variants of ne10_addc_* (add a
 * constant) for float, vec2f, vec3f and vec4f data and, under
 * PERFORMANCE_TEST, reports timing for each pair.
 * The float pair lives in ftbl_4args_cst and receives the constant by value
 * (thecst[0]); the vec2f..vec4f pairs live in ftbl_4args and receive a
 * pointer to the constant (thecst).
 * NOTE(review): this excerpt appears to have lost lines (braces, #endif,
 * declarations of loop/pos/i, timing code, and the condition that selects
 * between the two call forms); consult the full file before editing.
 */
184 void test_addc_case0()
186 #define MAX_VEC_COMPONENTS 4
188 ne10_int32_t func_loop;
190 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Register the variants: even slot = C routine, following odd slot = NEON routine. */
193 memset (ftbl_4args, 0,
sizeof (ftbl_4args));
194 memset (ftbl_4args_cst, 0,
sizeof (ftbl_4args_cst));
195 ftbl_4args_cst[ 0] = (ne10_func_4args_cst_t) ne10_addc_float_c;
196 ftbl_4args_cst[ 1] = (ne10_func_4args_cst_t) ne10_addc_float_neon;
197 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_addc_vec2f_c;
198 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_addc_vec2f_neon;
199 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_addc_vec3f_c;
200 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_addc_vec3f_neon;
201 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_addc_vec4f_c;
202 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_addc_vec4f_neon;
/* Correctness pass. */
204 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
205 ne10_int32_t vec_size;
/* Source/destination buffers are sized for the widest vector type at the largest element count. */
207 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
210 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
/* The constant buffer holds one vector of up to MAX_VEC_COMPONENTS floats. */
211 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS);
214 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
215 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
217 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
219 for (loop = 0; loop < TEST_ITERATION; loop++)
221 vec_size = func_loop + 1;
/* NOTE(review): presumably plants sentinels after the first loop * vec_size floats -- confirm in the macro definition. */
223 GUARD_ARRAY (thedst_c, loop * vec_size);
224 GUARD_ARRAY (thedst_neon, loop * vec_size);
/*
 * Two call forms follow. Only slots 0/1 of ftbl_4args_cst and slots 2..7 of
 * ftbl_4args are populated, so presumably the first pair runs for
 * func_loop == 0 and the second pair otherwise (selecting condition not
 * visible here).
 */
228 ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
229 ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
233 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
234 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
238 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
239 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* Compare the two outputs vector by vector; the prints dump the inputs as value and raw bits. */
241 for (pos = 0; pos < loop; pos++)
245 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
246 for (i = 0; i < vec_size; i++)
248 fprintf (stdout,
"thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
249 fprintf (stdout,
"thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
252 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
258 free (guarded_dst_c);
259 free (guarded_dst_neon);
/* Performance pass: prints a table header, then one row per variant pair. */
262 #ifdef PERFORMANCE_TEST
263 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
264 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
266 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
267 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS);
270 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
271 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* Scalar pair only (func_loop == 0): constant passed by value. */
/* NOTE(review): time_c and time_neon are not assigned in the visible lines; presumably set by timing code around these loops. */
273 for (func_loop = 0; func_loop < 1; func_loop++)
276 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
279 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
281 time_speedup = (ne10_float32_t) time_c / time_neon;
282 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
283 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Continues from func_loop == 1 with the vector pairs: constant passed by pointer. */
285 for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
288 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
291 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
293 time_speedup = (ne10_float32_t) time_c / time_neon;
294 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
295 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
298 free (perftest_guarded_src1);
299 free (perftest_guarded_cst);
300 free (perftest_guarded_dst_c);
301 free (perftest_guarded_dst_neon);
304 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
305 #undef MAX_VEC_COMPONENTS
/*
 * test_add_case0: compares the C and NEON variants of ne10_add_* (element-wise
 * add of two source arrays) for float, vec2f, vec3f and vec4f data and, under
 * PERFORMANCE_TEST, reports timing for each pair.
 * NOTE(review): this excerpt appears to have lost lines (braces, #endif,
 * declarations of loop/pos/i, timing code); consult the full file before
 * editing.
 */
308 void test_add_case0()
310 #define MAX_VEC_COMPONENTS 4
312 ne10_int32_t func_loop;
314 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Register the variants: slot 2*k holds the C routine for (k+1)-component data, slot 2*k+1 the NEON one. */
317 memset (ftbl_4args, 0,
sizeof (ftbl_4args));
318 ftbl_4args[ 0] = (ne10_func_4args_t) ne10_add_float_c;
319 ftbl_4args[ 1] = (ne10_func_4args_t) ne10_add_float_neon;
320 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_add_vec2f_c;
321 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_add_vec2f_neon;
322 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_add_vec3f_c;
323 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_add_vec3f_neon;
324 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_add_vec4f_c;
325 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_add_vec4f_neon;
/* Correctness pass. */
327 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
328 ne10_int32_t vec_size;
/* Buffers are sized for the widest vector type at the largest element count. */
330 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
333 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
334 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
337 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
338 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* For every variant pair and every element count in [0, TEST_ITERATION). */
340 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
342 for (loop = 0; loop < TEST_ITERATION; loop++)
344 vec_size = func_loop + 1;
/* NOTE(review): presumably plants sentinels after the first loop * vec_size floats -- confirm in the macro definition. */
346 GUARD_ARRAY (thedst_c, loop * vec_size);
347 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Run the C routine, then the NEON routine, on the same two sources with the same count. */
349 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
350 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
352 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
353 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* Compare the two outputs vector by vector; the prints dump the inputs as value and raw bits. */
355 for (pos = 0; pos < loop; pos++)
359 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
360 for (i = 0; i < vec_size; i++)
362 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
363 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
366 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
372 free (guarded_dst_c);
373 free (guarded_dst_neon);
/* Performance pass: prints a table header, then one row per variant pair. */
376 #ifdef PERFORMANCE_TEST
377 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
378 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
380 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
381 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
384 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
385 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
387 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
/* Each implementation is called once per count in [0, PERF_TEST_ITERATION). */
/* NOTE(review): time_c and time_neon are not assigned in the visible lines; presumably set by timing code around these loops. */
390 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
393 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
395 time_speedup = (ne10_float32_t) time_c / time_neon;
396 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
397 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
400 free (perftest_guarded_src1);
401 free (perftest_guarded_src2);
402 free (perftest_guarded_dst_c);
403 free (perftest_guarded_dst_neon);
406 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
407 #undef MAX_VEC_COMPONENTS
/*
 * test_cross_case0: compares the C and NEON variants of ne10_cross_vec3f and,
 * under PERFORMANCE_TEST, reports their timing. Only the vec3f pair exists,
 * so only table slots 4/5 are populated and the loops start at func_loop = 2
 * with MAX_VEC_COMPONENTS = 3 (a single iteration, vec_size = 3).
 * NOTE(review): this excerpt appears to have lost lines (braces, #endif,
 * declarations of loop/pos/i, timing code); consult the full file before
 * editing.
 */
410 void test_cross_case0()
412 #define MAX_VEC_COMPONENTS 3
414 ne10_int32_t func_loop;
416 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Slots 4/5 keep the same 2*func_loop indexing used by the other tests. */
419 memset (ftbl_4args, 0,
sizeof (ftbl_4args));
420 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_cross_vec3f_c;
421 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_cross_vec3f_neon;
/* Correctness pass. */
423 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
424 ne10_int32_t vec_size;
426 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
429 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
430 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
433 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
434 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
436 for (func_loop = 2; func_loop < MAX_VEC_COMPONENTS; func_loop++)
438 for (loop = 0; loop < TEST_ITERATION; loop++)
440 vec_size = func_loop + 1;
/* NOTE(review): presumably plants sentinels after the first loop * vec_size floats -- confirm in the macro definition. */
442 GUARD_ARRAY (thedst_c, loop * vec_size);
443 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Run the C routine, then the NEON routine, on the same two sources with the same count. */
445 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
446 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
448 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
449 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* Compare the two outputs vector by vector; the prints dump the inputs as value and raw bits. */
451 for (pos = 0; pos < loop; pos++)
455 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
456 for (i = 0; i < vec_size; i++)
458 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
459 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
462 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
468 free (guarded_dst_c);
469 free (guarded_dst_neon);
/* Performance pass: prints a table header, then one row for the vec3f pair. */
472 #ifdef PERFORMANCE_TEST
473 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
474 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
476 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
477 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
480 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
481 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
483 for (func_loop = 2; func_loop < MAX_VEC_COMPONENTS; func_loop++)
/* Each implementation is called once per count in [0, PERF_TEST_ITERATION). */
/* NOTE(review): time_c and time_neon are not assigned in the visible lines; presumably set by timing code around these loops. */
486 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
489 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
491 time_speedup = (ne10_float32_t) time_c / time_neon;
492 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
493 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
496 free (perftest_guarded_src1);
497 free (perftest_guarded_src2);
498 free (perftest_guarded_dst_c);
499 free (perftest_guarded_dst_neon);
502 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
503 #undef MAX_VEC_COMPONENTS
/*
 * test_divc_case0: compares the C and NEON variants of ne10_divc_* (divide by
 * a constant) for float, vec2f, vec3f and vec4f data and, under
 * PERFORMANCE_TEST, reports timing for each pair.
 * The float pair lives in ftbl_4args_cst and receives the constant by value
 * (thecst[0]); the vec2f..vec4f pairs live in ftbl_4args and receive a
 * pointer to the constant (thecst).
 * NOTE(review): this excerpt appears to have lost lines (braces, #endif,
 * declarations of loop/pos/i, timing code, and the condition that selects
 * between the two call forms); consult the full file before editing.
 */
506 void test_divc_case0()
508 #define MAX_VEC_COMPONENTS 4
510 ne10_int32_t func_loop;
512 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Register the variants: even slot = C routine, following odd slot = NEON routine. */
515 memset (ftbl_4args, 0,
sizeof (ftbl_4args));
516 memset (ftbl_4args_cst, 0,
sizeof (ftbl_4args_cst));
517 ftbl_4args_cst[ 0] = (ne10_func_4args_cst_t) ne10_divc_float_c;
518 ftbl_4args_cst[ 1] = (ne10_func_4args_cst_t) ne10_divc_float_neon;
519 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_divc_vec2f_c;
520 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_divc_vec2f_neon;
521 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_divc_vec3f_c;
522 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_divc_vec3f_neon;
523 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_divc_vec4f_c;
524 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_divc_vec4f_neon;
/* Correctness pass. */
526 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
527 ne10_int32_t vec_size;
/* Source/destination buffers are sized for the widest vector type at the largest element count. */
529 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
532 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
/* The constant buffer holds one vector of up to MAX_VEC_COMPONENTS floats. */
533 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS);
536 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
537 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
539 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
541 for (loop = 0; loop < TEST_ITERATION; loop++)
543 vec_size = func_loop + 1;
/* NOTE(review): presumably plants sentinels after the first loop * vec_size floats -- confirm in the macro definition. */
545 GUARD_ARRAY (thedst_c, loop * vec_size);
546 GUARD_ARRAY (thedst_neon, loop * vec_size);
/*
 * Two call forms follow. Only slots 0/1 of ftbl_4args_cst and slots 2..7 of
 * ftbl_4args are populated, so presumably the first pair runs for
 * func_loop == 0 and the second pair otherwise (selecting condition not
 * visible here).
 */
550 ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
551 ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
555 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
556 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
559 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
560 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* Compare the two outputs vector by vector; the prints dump the inputs as value and raw bits. */
562 for (pos = 0; pos < loop; pos++)
566 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
567 for (i = 0; i < vec_size; i++)
569 fprintf (stdout,
"thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
570 fprintf (stdout,
"thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
573 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
579 free (guarded_dst_c);
580 free (guarded_dst_neon);
/* Performance pass: prints a table header, then one row per variant pair. */
583 #ifdef PERFORMANCE_TEST
584 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
585 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
587 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
588 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS);
591 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
592 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* Scalar pair only (func_loop == 0): constant passed by value. */
/* NOTE(review): time_c and time_neon are not assigned in the visible lines; presumably set by timing code around these loops. */
594 for (func_loop = 0; func_loop < 1; func_loop++)
597 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
600 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
602 time_speedup = (ne10_float32_t) time_c / time_neon;
603 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
604 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Continues from func_loop == 1 with the vector pairs: constant passed by pointer. */
606 for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
609 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
612 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
614 time_speedup = (ne10_float32_t) time_c / time_neon;
615 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
616 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
619 free (perftest_guarded_src1);
620 free (perftest_guarded_cst);
621 free (perftest_guarded_dst_c);
622 free (perftest_guarded_dst_neon);
625 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
626 #undef MAX_VEC_COMPONENTS
/*
 * test_div_case0: compares the C and NEON variants of the element-wise divide
 * routines (ne10_div_float_* and ne10_vdiv_vec2f/3f/4f_*) and, under
 * PERFORMANCE_TEST, reports timing for each pair. Results are compared with
 * ERROR_MARGIN_LARGE rather than the ERROR_MARGIN_SMALL used by the add tests.
 * NOTE(review): this excerpt appears to have lost lines (braces, #endif,
 * declarations of loop/pos/i, timing code); consult the full file before
 * editing.
 */
629 void test_div_case0()
631 #define MAX_VEC_COMPONENTS 4
633 ne10_int32_t func_loop;
635 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Register the variants: slot 2*k holds the C routine for (k+1)-component data, slot 2*k+1 the NEON one. */
638 memset (ftbl_4args, 0,
sizeof (ftbl_4args));
639 ftbl_4args[ 0] = (ne10_func_4args_t) ne10_div_float_c;
640 ftbl_4args[ 1] = (ne10_func_4args_t) ne10_div_float_neon;
641 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_vdiv_vec2f_c;
642 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_vdiv_vec2f_neon;
643 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_vdiv_vec3f_c;
644 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_vdiv_vec3f_neon;
645 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_vdiv_vec4f_c;
646 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_vdiv_vec4f_neon;
/* Correctness pass. */
648 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
649 ne10_int32_t vec_size;
/* Buffers are sized for the widest vector type at the largest element count. */
651 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
654 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
655 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
658 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
659 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* For every variant pair and every element count in [0, TEST_ITERATION). */
661 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
663 for (loop = 0; loop < TEST_ITERATION; loop++)
665 vec_size = func_loop + 1;
/* NOTE(review): presumably plants sentinels after the first loop * vec_size floats -- confirm in the macro definition. */
667 GUARD_ARRAY (thedst_c, loop * vec_size);
668 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Run the C routine, then the NEON routine, on the same two sources with the same count. */
670 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
671 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
673 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
674 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* Compare the two outputs vector by vector; the prints dump the inputs as value and raw bits. */
676 for (pos = 0; pos < loop; pos++)
680 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
681 for (i = 0; i < vec_size; i++)
683 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
684 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
687 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_LARGE, vec_size);
693 free (guarded_dst_c);
694 free (guarded_dst_neon);
/* Performance pass: prints a table header, then one row per variant pair. */
697 #ifdef PERFORMANCE_TEST
698 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
699 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
701 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
702 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
705 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
706 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
708 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
/* Each implementation is called once per count in [0, PERF_TEST_ITERATION). */
/* NOTE(review): time_c and time_neon are not assigned in the visible lines; presumably set by timing code around these loops. */
711 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
714 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
716 time_speedup = (ne10_float32_t) time_c / time_neon;
717 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
718 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
721 free (perftest_guarded_src1);
722 free (perftest_guarded_src2);
723 free (perftest_guarded_dst_c);
724 free (perftest_guarded_dst_neon);
727 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
728 #undef MAX_VEC_COMPONENTS
/*
 * test_dot_case0: compares the C and NEON variants of ne10_dot_vec2f/3f/4f
 * and, under PERFORMANCE_TEST, reports timing for each pair. There is no
 * scalar variant, so slots 0/1 stay zeroed and the loops start at
 * func_loop = 1.
 * The destination is guarded after `loop` floats and compared one float per
 * position (&thedst_c[pos], count 1), i.e. the test treats the output as one
 * float per input vector pair.
 * NOTE(review): this excerpt appears to have lost lines (braces, #endif,
 * declarations of loop/pos/i, timing code); consult the full file before
 * editing.
 */
731 void test_dot_case0()
733 #define MAX_VEC_COMPONENTS 4
735 ne10_int32_t func_loop;
737 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Register the variants: slot 2*k holds the C routine for (k+1)-component data, slot 2*k+1 the NEON one. */
740 memset (ftbl_4args, 0,
sizeof (ftbl_4args));
741 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_dot_vec2f_c;
742 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_dot_vec2f_neon;
743 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_dot_vec3f_c;
744 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_dot_vec3f_neon;
745 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_dot_vec4f_c;
746 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_dot_vec4f_neon;
/* Correctness pass. */
748 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
750 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
753 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
754 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
757 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
758 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
760 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
762 for (loop = 0; loop < TEST_ITERATION; loop++)
/* vec_size is used only to index the source arrays in the diagnostic prints below. */
765 ne10_int32_t vec_size = func_loop + 1;
/* NOTE(review): presumably plants sentinels after the first `loop` floats -- confirm in the macro definition. */
768 GUARD_ARRAY (thedst_c, loop);
769 GUARD_ARRAY (thedst_neon, loop);
/* Run the C routine, then the NEON routine, on the same two sources with the same count. */
771 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
772 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
774 CHECK_ARRAY_GUARD (thedst_c, loop);
775 CHECK_ARRAY_GUARD (thedst_neon, loop);
/* Compare one scalar result per position; the prints dump the inputs as value and raw bits. */
777 for (pos = 0; pos < loop; pos++)
781 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
782 for (i = 0; i < vec_size; i++)
784 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
785 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
788 assert_float_vec_equal (&thedst_c[pos], &thedst_neon[pos], ERROR_MARGIN_SMALL, 1);
794 free (guarded_dst_c);
795 free (guarded_dst_neon);
/* Performance pass: prints a table header, then one row per variant pair. */
798 #ifdef PERFORMANCE_TEST
799 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
800 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
802 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
803 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
806 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
807 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
809 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
/* Each implementation is called once per count in [0, PERF_TEST_ITERATION). */
/* NOTE(review): time_c and time_neon are not assigned in the visible lines; presumably set by timing code around these loops. */
812 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
815 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
817 time_speedup = (ne10_float32_t) time_c / time_neon;
818 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
819 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
822 free (perftest_guarded_src1);
823 free (perftest_guarded_src2);
824 free (perftest_guarded_dst_c);
825 free (perftest_guarded_dst_neon);
828 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
829 #undef MAX_VEC_COMPONENTS
/*
 * test_len_case0: compares the C and NEON variants of ne10_len_vec2f/3f/4f
 * and, under PERFORMANCE_TEST, reports timing for each pair. There is no
 * scalar variant, so slots 0/1 stay zeroed and the loops start at
 * func_loop = 1.
 * The destination is guarded after `loop` floats, i.e. the test treats the
 * output as one float per input vector. The result comparison therefore
 * indexes the destination arrays with `pos`; vec_size is only used to index
 * the source array in the diagnostic prints.
 * NOTE(review): this excerpt appears to have lost lines (braces, #endif,
 * declarations of loop/pos/i, timing code); consult the full file before
 * editing.
 */
832 void test_len_case0()
834 #define MAX_VEC_COMPONENTS 4
836 ne10_int32_t func_loop;
/* Register the variants: slot 2*k holds the C routine for (k+1)-component data, slot 2*k+1 the NEON one. */
839 memset (ftbl_3args, 0,
sizeof (ftbl_3args));
840 ftbl_3args[ 2] = (ne10_func_3args_t) ne10_len_vec2f_c;
841 ftbl_3args[ 3] = (ne10_func_3args_t) ne10_len_vec2f_neon;
842 ftbl_3args[ 4] = (ne10_func_3args_t) ne10_len_vec3f_c;
843 ftbl_3args[ 5] = (ne10_func_3args_t) ne10_len_vec3f_neon;
844 ftbl_3args[ 6] = (ne10_func_3args_t) ne10_len_vec4f_c;
845 ftbl_3args[ 7] = (ne10_func_3args_t) ne10_len_vec4f_neon;
847 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Correctness pass. */
849 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
850 ne10_int32_t vec_size;
852 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
855 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
858 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
859 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
861 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
863 for (loop = 0; loop < TEST_ITERATION; loop++)
865 vec_size = func_loop + 1;
/* NOTE(review): presumably plants sentinels after the first `loop` floats -- confirm in the macro definition. */
867 GUARD_ARRAY (thedst_c, loop);
868 GUARD_ARRAY (thedst_neon, loop);
/* Run the C routine, then the NEON routine, on the same source with the same count. */
870 ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
871 ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
873 CHECK_ARRAY_GUARD (thedst_c, loop);
874 CHECK_ARRAY_GUARD (thedst_neon, loop);
/* Compare one scalar result per position; the prints dump the inputs as value and raw bits. */
876 for (pos = 0; pos < loop; pos++)
880 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
881 for (i = 0; i < vec_size; i++)
883 fprintf (stdout,
"thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
/* Result for vector `pos` is at dst[pos]; a pos * vec_size index would skip results and read past the `loop` written floats. */
886 assert_float_vec_equal (&thedst_c[pos], &thedst_neon[pos], ERROR_MARGIN_LARGE, 1);
891 free (guarded_dst_c);
892 free (guarded_dst_neon);
/* Performance pass: prints a table header, then one row per variant pair. */
895 #ifdef PERFORMANCE_TEST
896 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
897 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
899 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
902 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
903 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
905 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
/* Each implementation is called once per count in [0, PERF_TEST_ITERATION). */
/* NOTE(review): time_c and time_neon are not assigned in the visible lines; presumably set by timing code around these loops. */
908 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
911 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
913 time_speedup = (ne10_float32_t) time_c / time_neon;
914 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
915 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
918 free (perftest_guarded_src1);
919 free (perftest_guarded_dst_c);
920 free (perftest_guarded_dst_neon);
923 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
924 #undef MAX_VEC_COMPONENTS
/*
 * test_mlac_case0: runs the ne10_mlac_* "_c" and "_neon" routines on the same
 * inputs and hands both outputs to assert_float_vec_equal; under
 * PERFORMANCE_TEST it also times the two variants and logs the ratio.
 * Takes no arguments and returns nothing.
 */
927 void test_mlac_case0()
929 #define MAX_VEC_COMPONENTS 4
931 ne10_int32_t func_loop;
933 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Reset both tables, then register the pairs: even slot = C routine, odd
 * slot = NEON routine. The float pair goes into the _cst table, the
 * vec2f/vec3f/vec4f pairs into ftbl_5args. */
936 memset (ftbl_5args, 0,
sizeof (ftbl_5args));
937 memset (ftbl_5args_cst, 0,
sizeof (ftbl_5args_cst));
938 ftbl_5args_cst[ 0] = (ne10_func_5args_cst_t) ne10_mlac_float_c;
939 ftbl_5args_cst[ 1] = (ne10_func_5args_cst_t) ne10_mlac_float_neon;
940 ftbl_5args[ 2] = (ne10_func_5args_t) ne10_mlac_vec2f_c;
941 ftbl_5args[ 3] = (ne10_func_5args_t) ne10_mlac_vec2f_neon;
942 ftbl_5args[ 4] = (ne10_func_5args_t) ne10_mlac_vec3f_c;
943 ftbl_5args[ 5] = (ne10_func_5args_t) ne10_mlac_vec3f_neon;
944 ftbl_5args[ 6] = (ne10_func_5args_t) ne10_mlac_vec4f_c;
945 ftbl_5args[ 7] = (ne10_func_5args_t) ne10_mlac_vec4f_neon;
/* Conformance part: every registered pair is run with loop = 0..TEST_ITERATION-1
 * as its last argument. */
947 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
948 ne10_int32_t vec_size;
950 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
/* Accumulator and source are sized for the widest vector; the constant holds
 * MAX_VEC_COMPONENTS floats. */
953 NE10_SRC_ALLOC_LIMIT (theacc, guarded_acc, fixed_length);
954 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
955 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS);
958 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
959 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
961 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
963 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Table pair func_loop works on vectors of func_loop + 1 floats. */
965 vec_size = func_loop + 1;
967 GUARD_ARRAY (thedst_c, loop * vec_size);
968 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Scalar-constant call form (slots 0/1 of ftbl_5args_cst).
 * NOTE(review): confirm this form is selected only for func_loop == 0. */
972 ftbl_5args_cst[2 * func_loop] (thedst_c, theacc, thesrc1, thecst[0], loop);
973 ftbl_5args_cst[2 * func_loop + 1] (thedst_neon, theacc, thesrc1, thecst[0], loop);
/* Pointer-constant call form (slots 2..7 of ftbl_5args). */
977 ftbl_5args[2 * func_loop] (thedst_c, theacc, thesrc1, thecst, loop);
978 ftbl_5args[2 * func_loop + 1] (thedst_neon, theacc, thesrc1, thecst, loop);
981 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
982 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* For each of the loop positions: dump the inputs, then pass the C and NEON
 * results to assert_float_vec_equal with ERROR_MARGIN_SMALL. */
984 for (pos = 0; pos < loop; pos++)
988 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
989 for (i = 0; i < vec_size; i++)
991 fprintf (stdout,
"theacc->%d: %f [0x%04X] \n", i, theacc[pos * vec_size + i], * (ne10_uint32_t*) &theacc[pos * vec_size + i]);
992 fprintf (stdout,
"thesrc->%d: %f [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
993 fprintf (stdout,
"thecst->%d: %f [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
996 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1001 free (guarded_src1);
1003 free (guarded_dst_c);
1004 free (guarded_dst_neon);
/* Benchmark part: each variant is called PERF_TEST_ITERATION times on the
 * perftest_* buffers and one result row is logged per table pair. */
1007 #ifdef PERFORMANCE_TEST
1008 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
1009 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1011 NE10_SRC_ALLOC_LIMIT (perftest_theacc, perftest_guarded_acc, perftest_length);
1012 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
1013 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS);
1016 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1017 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* First pass covers only pair 0 (scalar-constant table); func_loop is 1 when
 * this loop exits and is reused as the start of the next loop. */
1019 for (func_loop = 0; func_loop < 1; func_loop++)
1022 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_5args_cst[2 * func_loop] (perftest_thedst_c, perftest_theacc, perftest_thesrc1, perftest_thecst[0], loop);
1024 GET_TIME (time_neon,
1025 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_5args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_theacc, perftest_thesrc1, perftest_thecst[0], loop);
/* Speedup is the C/NEON time ratio; savings is the relative reduction in percent. */
1027 time_speedup = (ne10_float32_t) time_c / time_neon;
1028 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1029 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Remaining pairs (pointer-constant table), continuing from the current func_loop. */
1031 for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1034 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_5args[2 * func_loop] (perftest_thedst_c, perftest_theacc, perftest_thesrc1, perftest_thecst, loop);
1036 GET_TIME (time_neon,
1037 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_5args[2 * func_loop + 1] (perftest_thedst_neon, perftest_theacc, perftest_thesrc1, perftest_thecst, loop);
1039 time_speedup = (ne10_float32_t) time_c / time_neon;
1040 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1041 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1044 free (perftest_guarded_acc);
1045 free (perftest_guarded_src1);
1046 free (perftest_guarded_cst);
1047 free (perftest_guarded_dst_c);
1048 free (perftest_guarded_dst_neon);
1051 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/* The component-count macro is local to this test. */
1052 #undef MAX_VEC_COMPONENTS
/*
 * test_mla_case0: runs the ne10_mla_float / ne10_vmla_vecNf "_c" and "_neon"
 * routines on the same accumulator and two source buffers and hands both
 * outputs to assert_float_vec_equal; under PERFORMANCE_TEST it also times
 * the two variants. Takes no arguments and returns nothing.
 */
1055 void test_mla_case0()
1057 #define MAX_VEC_COMPONENTS 4
1059 ne10_int32_t func_loop;
1061 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Reset the table, then register the pairs: even slot = C routine, odd slot
 * = NEON routine, for float, vec2f, vec3f and vec4f. */
1064 memset (ftbl_5args, 0,
sizeof (ftbl_5args));
1065 ftbl_5args[ 0] = (ne10_func_5args_t) ne10_mla_float_c;
1066 ftbl_5args[ 1] = (ne10_func_5args_t) ne10_mla_float_neon;
1067 ftbl_5args[ 2] = (ne10_func_5args_t) ne10_vmla_vec2f_c;
1068 ftbl_5args[ 3] = (ne10_func_5args_t) ne10_vmla_vec2f_neon;
1069 ftbl_5args[ 4] = (ne10_func_5args_t) ne10_vmla_vec3f_c;
1070 ftbl_5args[ 5] = (ne10_func_5args_t) ne10_vmla_vec3f_neon;
1071 ftbl_5args[ 6] = (ne10_func_5args_t) ne10_vmla_vec4f_c;
1072 ftbl_5args[ 7] = (ne10_func_5args_t) ne10_vmla_vec4f_neon;
/* Conformance part: every pair is run with loop = 0..TEST_ITERATION-1 as its
 * last argument. */
1074 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
1075 ne10_int32_t vec_size;
1077 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
/* All input and output buffers are sized for the widest vector. */
1080 NE10_SRC_ALLOC_LIMIT (theacc, guarded_acc, fixed_length);
1081 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
1082 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
1085 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1086 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1088 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1090 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Table pair func_loop works on vectors of func_loop + 1 floats. */
1092 vec_size = func_loop + 1;
1094 GUARD_ARRAY (thedst_c, loop * vec_size);
1095 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Same inputs for both variants; only the destination differs. */
1097 ftbl_5args[2 * func_loop] (thedst_c, theacc, thesrc1, thesrc2, loop);
1098 ftbl_5args[2 * func_loop + 1] (thedst_neon, theacc, thesrc1, thesrc2, loop);
1100 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
1101 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* For each of the loop positions: dump the inputs, then pass the C and NEON
 * results to assert_float_vec_equal with ERROR_MARGIN_SMALL. */
1103 for (pos = 0; pos < loop; pos++)
1107 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1108 for (i = 0; i < vec_size; i++)
1110 fprintf (stdout,
"theacc->%d: %e [0x%04X] \n", i, theacc[pos * vec_size + i], * (ne10_uint32_t*) &theacc[pos * vec_size + i]);
1111 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1112 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
1115 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1120 free (guarded_src1);
1121 free (guarded_src2);
1122 free (guarded_dst_c);
1123 free (guarded_dst_neon);
/* Benchmark part: each variant is called PERF_TEST_ITERATION times on the
 * perftest_* buffers and one result row is logged per table pair. */
1126 #ifdef PERFORMANCE_TEST
1127 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
1128 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1130 NE10_SRC_ALLOC_LIMIT (perftest_theacc, perftest_guarded_acc, perftest_length);
1131 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
1132 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
1135 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1136 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1138 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1141 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_5args[2 * func_loop] (perftest_thedst_c, perftest_theacc, perftest_thesrc1, perftest_thesrc2, loop);
1143 GET_TIME (time_neon,
1144 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_5args[2 * func_loop + 1] (perftest_thedst_neon, perftest_theacc, perftest_thesrc1, perftest_thesrc2, loop);
/* Speedup is the C/NEON time ratio; savings is the relative reduction in percent. */
1146 time_speedup = (ne10_float32_t) time_c / time_neon;
1147 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1148 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1151 free (perftest_guarded_acc);
1152 free (perftest_guarded_src1);
1153 free (perftest_guarded_src2);
1154 free (perftest_guarded_dst_c);
1155 free (perftest_guarded_dst_neon);
1158 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/* The component-count macro is local to this test. */
1159 #undef MAX_VEC_COMPONENTS
/*
 * test_mulc_case0: runs the ne10_mulc_* "_c" and "_neon" routines on the same
 * source buffer and constant and hands both outputs to
 * assert_float_vec_equal; under PERFORMANCE_TEST it also times the two
 * variants. Takes no arguments and returns nothing.
 */
1162 void test_mulc_case0()
1164 #define MAX_VEC_COMPONENTS 4
1166 ne10_int32_t func_loop;
1168 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Reset both tables, then register the pairs: even slot = C routine, odd
 * slot = NEON routine. The float pair goes into the _cst table, the
 * vec2f/vec3f/vec4f pairs into ftbl_4args. */
1171 memset (ftbl_4args, 0,
sizeof (ftbl_4args));
1172 memset (ftbl_4args_cst, 0,
sizeof (ftbl_4args_cst));
1173 ftbl_4args_cst[ 0] = (ne10_func_4args_cst_t) ne10_mulc_float_c;
1174 ftbl_4args_cst[ 1] = (ne10_func_4args_cst_t) ne10_mulc_float_neon;
1175 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_mulc_vec2f_c;
1176 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_mulc_vec2f_neon;
1177 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_mulc_vec3f_c;
1178 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_mulc_vec3f_neon;
1179 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_mulc_vec4f_c;
1180 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_mulc_vec4f_neon;
/* Conformance part: every pair is run with loop = 0..TEST_ITERATION-1 as its
 * last argument. */
1182 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
1183 ne10_int32_t vec_size;
1185 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
/* The source is sized for the widest vector; the constant holds
 * MAX_VEC_COMPONENTS floats. */
1188 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
1189 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS);
1192 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1193 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1195 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1197 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Table pair func_loop works on vectors of func_loop + 1 floats. */
1199 vec_size = func_loop + 1;
1201 GUARD_ARRAY (thedst_c, loop * vec_size);
1202 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Scalar-constant call form (slots 0/1 of ftbl_4args_cst).
 * NOTE(review): confirm this form is selected only for func_loop == 0. */
1206 ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
1207 ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
/* Pointer-constant call form (slots 2..7 of ftbl_4args). */
1211 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
1212 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
1215 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
1216 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* For each of the loop positions: dump the inputs, then pass the C and NEON
 * results to assert_float_vec_equal with ERROR_MARGIN_SMALL. */
1218 for (pos = 0; pos < loop; pos++)
1222 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1223 for (i = 0; i < vec_size; i++)
1225 fprintf (stdout,
"thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1226 fprintf (stdout,
"thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
1229 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1233 free (guarded_src1);
1235 free (guarded_dst_c);
1236 free (guarded_dst_neon);
/* Benchmark part: each variant is called PERF_TEST_ITERATION times on the
 * perftest_* buffers and one result row is logged per table pair. */
1239 #ifdef PERFORMANCE_TEST
1240 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
1241 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1243 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
1244 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS);
1247 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1248 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* First pass covers only pair 0 (scalar-constant table); func_loop is 1 when
 * this loop exits and is reused as the start of the next loop. */
1250 for (func_loop = 0; func_loop < 1; func_loop++)
1253 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
1255 GET_TIME (time_neon,
1256 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
/* Speedup is the C/NEON time ratio; savings is the relative reduction in percent. */
1258 time_speedup = (ne10_float32_t) time_c / time_neon;
1259 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1260 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Remaining pairs (pointer-constant table), continuing from the current func_loop. */
1262 for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1265 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
1267 GET_TIME (time_neon,
1268 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
1270 time_speedup = (ne10_float32_t) time_c / time_neon;
1271 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1272 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1275 free (perftest_guarded_src1);
1276 free (perftest_guarded_cst);
1277 free (perftest_guarded_dst_c);
1278 free (perftest_guarded_dst_neon);
1281 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/* The component-count macro is local to this test. */
1282 #undef MAX_VEC_COMPONENTS
/*
 * test_mul_case0: runs the ne10_mul_float / ne10_vmul_vecNf "_c" and "_neon"
 * routines on the same two source buffers and hands both outputs to
 * assert_float_vec_equal; under PERFORMANCE_TEST it also times the two
 * variants. Takes no arguments and returns nothing.
 */
1285 void test_mul_case0()
1287 #define MAX_VEC_COMPONENTS 4
1289 ne10_int32_t func_loop;
1291 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Reset the table, then register the pairs: even slot = C routine, odd slot
 * = NEON routine, for float, vec2f, vec3f and vec4f. */
1294 memset (ftbl_4args, 0,
sizeof (ftbl_4args));
1295 ftbl_4args[ 0] = (ne10_func_4args_t) ne10_mul_float_c;
1296 ftbl_4args[ 1] = (ne10_func_4args_t) ne10_mul_float_neon;
1297 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_vmul_vec2f_c;
1298 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_vmul_vec2f_neon;
1299 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_vmul_vec3f_c;
1300 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_vmul_vec3f_neon;
1301 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_vmul_vec4f_c;
1302 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_vmul_vec4f_neon;
/* Conformance part: every pair is run with loop = 0..TEST_ITERATION-1 as its
 * last argument. */
1304 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
1305 ne10_int32_t vec_size;
1307 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
/* All input and output buffers are sized for the widest vector. */
1310 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
1311 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
1314 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1315 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1317 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1319 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Table pair func_loop works on vectors of func_loop + 1 floats. */
1321 vec_size = func_loop + 1;
1323 GUARD_ARRAY (thedst_c, loop * vec_size);
1324 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Same inputs for both variants; only the destination differs. */
1326 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
1327 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
1329 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
1330 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* For each of the loop positions: dump the inputs, then pass the C and NEON
 * results to assert_float_vec_equal with ERROR_MARGIN_SMALL. */
1332 for (pos = 0; pos < loop; pos++)
1336 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1337 for (i = 0; i < vec_size; i++)
1339 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1340 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
1343 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1347 free (guarded_src1);
1348 free (guarded_src2);
1349 free (guarded_dst_c);
1350 free (guarded_dst_neon);
/* Benchmark part: each variant is called PERF_TEST_ITERATION times on the
 * perftest_* buffers and one result row is logged per table pair. */
1353 #ifdef PERFORMANCE_TEST
1354 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
1355 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1357 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
1358 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
1361 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1362 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1364 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1367 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
1369 GET_TIME (time_neon,
1370 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
/* Speedup is the C/NEON time ratio; savings is the relative reduction in percent. */
1372 time_speedup = (ne10_float32_t) time_c / time_neon;
1373 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1374 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1377 free (perftest_guarded_src1);
1378 free (perftest_guarded_src2);
1379 free (perftest_guarded_dst_c);
1380 free (perftest_guarded_dst_neon);
1383 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/* The component-count macro is local to this test. */
1384 #undef MAX_VEC_COMPONENTS
/*
 * test_normalize_case0: runs the ne10_normalize_vecNf "_c" and "_neon"
 * routines on the same source buffer and hands both outputs to
 * assert_float_vec_equal; under PERFORMANCE_TEST it also times the two
 * variants. Only vec2f, vec3f and vec4f are covered.
 * Takes no arguments and returns nothing.
 */
1387 void test_normalize_case0()
1389 #define MAX_VEC_COMPONENTS 4
1391 ne10_int32_t func_loop;
/* Reset the table, then register the pairs: even slot = C routine, odd slot
 * = NEON routine. Slots 0/1 stay zeroed, which is why every loop below
 * starts at func_loop = 1. */
1394 memset (ftbl_3args, 0,
sizeof (ftbl_3args));
1395 ftbl_3args[ 2] = (ne10_func_3args_t) ne10_normalize_vec2f_c;
1396 ftbl_3args[ 3] = (ne10_func_3args_t) ne10_normalize_vec2f_neon;
1397 ftbl_3args[ 4] = (ne10_func_3args_t) ne10_normalize_vec3f_c;
1398 ftbl_3args[ 5] = (ne10_func_3args_t) ne10_normalize_vec3f_neon;
1399 ftbl_3args[ 6] = (ne10_func_3args_t) ne10_normalize_vec4f_c;
1400 ftbl_3args[ 7] = (ne10_func_3args_t) ne10_normalize_vec4f_neon;
1402 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Conformance part: every registered pair is run with
 * loop = 0..TEST_ITERATION-1 as its last argument. */
1404 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
1405 ne10_int32_t vec_size;
1407 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
/* Input and output buffers are sized for the widest vector. */
1410 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
1413 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1414 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1416 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1418 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Table pair func_loop works on vectors of func_loop + 1 floats. */
1420 vec_size = func_loop + 1;
1422 GUARD_ARRAY (thedst_c, loop * vec_size);
1423 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Same input for both variants; only the destination differs. */
1425 ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
1426 ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
1428 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
1429 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* For each of the loop positions: dump the input, then pass the C and NEON
 * results to assert_float_vec_equal. This test uses ERROR_MARGIN_LARGE. */
1431 for (pos = 0; pos < loop; pos++)
1435 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1436 for (i = 0; i < vec_size; i++)
1438 fprintf (stdout,
"thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1441 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_LARGE, vec_size);
1445 free (guarded_src1);
1446 free (guarded_dst_c);
1447 free (guarded_dst_neon);
/* Benchmark part: each variant is called PERF_TEST_ITERATION times on the
 * perftest_* buffers and one result row is logged per table pair. */
1450 #ifdef PERFORMANCE_TEST
1451 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
1452 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1454 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
1457 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1458 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1460 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1463 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
1465 GET_TIME (time_neon,
1466 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
/* Speedup is the C/NEON time ratio; savings is the relative reduction in percent. */
1468 time_speedup = (ne10_float32_t) time_c / time_neon;
1469 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1470 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1473 free (perftest_guarded_src1);
1474 free (perftest_guarded_dst_c);
1475 free (perftest_guarded_dst_neon);
1478 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/* The component-count macro is local to this test. */
1479 #undef MAX_VEC_COMPONENTS
/*
 * test_rsbc_case0: runs the ne10_rsbc_* "_c" and "_neon" routines on the same
 * source buffer and constant and hands both outputs to
 * assert_float_vec_equal; under PERFORMANCE_TEST it also times the two
 * variants. Takes no arguments and returns nothing.
 */
1482 void test_rsbc_case0()
1484 #define MAX_VEC_COMPONENTS 4
1486 ne10_int32_t func_loop;
1488 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Reset both tables, then register the pairs: even slot = C routine, odd
 * slot = NEON routine. The float pair goes into the _cst table, the
 * vec2f/vec3f/vec4f pairs into ftbl_4args. */
1491 memset (ftbl_4args, 0,
sizeof (ftbl_4args));
1492 memset (ftbl_4args_cst, 0,
sizeof (ftbl_4args_cst));
1493 ftbl_4args_cst[ 0] = (ne10_func_4args_cst_t) ne10_rsbc_float_c;
1494 ftbl_4args_cst[ 1] = (ne10_func_4args_cst_t) ne10_rsbc_float_neon;
1495 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_rsbc_vec2f_c;
1496 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_rsbc_vec2f_neon;
1497 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_rsbc_vec3f_c;
1498 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_rsbc_vec3f_neon;
1499 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_rsbc_vec4f_c;
1500 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_rsbc_vec4f_neon;
/* Conformance part: every pair is run with loop = 0..TEST_ITERATION-1 as its
 * last argument. */
1502 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
1503 ne10_int32_t vec_size;
1505 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
/* The source is sized for the widest vector; the constant holds
 * MAX_VEC_COMPONENTS floats. */
1508 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
1509 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS);
1512 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1513 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1515 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1517 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Table pair func_loop works on vectors of func_loop + 1 floats. */
1519 vec_size = func_loop + 1;
1521 GUARD_ARRAY (thedst_c, loop * vec_size);
1522 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Scalar-constant call form (slots 0/1 of ftbl_4args_cst).
 * NOTE(review): confirm this form is selected only for func_loop == 0. */
1526 ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
1527 ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
/* Pointer-constant call form (slots 2..7 of ftbl_4args). */
1531 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
1532 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
1535 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
1536 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* For each of the loop positions: dump the inputs, then pass the C and NEON
 * results to assert_float_vec_equal with ERROR_MARGIN_SMALL. */
1538 for (pos = 0; pos < loop; pos++)
1542 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1543 for (i = 0; i < vec_size; i++)
1545 fprintf (stdout,
"thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1546 fprintf (stdout,
"thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
1549 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1553 free (guarded_src1);
1555 free (guarded_dst_c);
1556 free (guarded_dst_neon);
/* Benchmark part: each variant is called PERF_TEST_ITERATION times on the
 * perftest_* buffers and one result row is logged per table pair. */
1559 #ifdef PERFORMANCE_TEST
1560 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
1561 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1563 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
1564 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS);
1567 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1568 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* First pass covers only pair 0 (scalar-constant table); func_loop is 1 when
 * this loop exits and is reused as the start of the next loop. */
1570 for (func_loop = 0; func_loop < 1; func_loop++)
1573 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
1575 GET_TIME (time_neon,
1576 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
/* Speedup is the C/NEON time ratio; savings is the relative reduction in percent. */
1578 time_speedup = (ne10_float32_t) time_c / time_neon;
1579 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1580 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Remaining pairs (pointer-constant table), continuing from the current func_loop. */
1582 for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1585 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
1587 GET_TIME (time_neon,
1588 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
1590 time_speedup = (ne10_float32_t) time_c / time_neon;
1591 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1592 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1595 free (perftest_guarded_src1);
1596 free (perftest_guarded_cst);
1597 free (perftest_guarded_dst_c);
1598 free (perftest_guarded_dst_neon);
1601 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/* The component-count macro is local to this test. */
1602 #undef MAX_VEC_COMPONENTS
/*
 * test_setc_case0: runs the ne10_setc_* "_c" and "_neon" routines with the
 * same constant and hands both outputs to assert_float_vec_equal; under
 * PERFORMANCE_TEST it also times the two variants.
 * This test has no source buffer: the only inputs are the constant (thecst)
 * and the count passed as the last argument.
 * Takes no arguments and returns nothing.
 */
1605 void test_setc_case0()
1607 #define MAX_VEC_COMPONENTS 4
1609 ne10_int32_t func_loop;
1611 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Reset both tables, then register the pairs: even slot = C routine, odd
 * slot = NEON routine. The float pair goes into the _cst table, the
 * vec2f/vec3f/vec4f pairs into ftbl_3args. */
1614 memset (ftbl_3args, 0,
sizeof (ftbl_3args));
1615 memset (ftbl_3args_cst, 0,
sizeof (ftbl_3args_cst));
1616 ftbl_3args_cst[ 0] = (ne10_func_3args_cst_t) ne10_setc_float_c;
1617 ftbl_3args_cst[ 1] = (ne10_func_3args_cst_t) ne10_setc_float_neon;
1618 ftbl_3args[ 2] = (ne10_func_3args_t) ne10_setc_vec2f_c;
1619 ftbl_3args[ 3] = (ne10_func_3args_t) ne10_setc_vec2f_neon;
1620 ftbl_3args[ 4] = (ne10_func_3args_t) ne10_setc_vec3f_c;
1621 ftbl_3args[ 5] = (ne10_func_3args_t) ne10_setc_vec3f_neon;
1622 ftbl_3args[ 6] = (ne10_func_3args_t) ne10_setc_vec4f_c;
1623 ftbl_3args[ 7] = (ne10_func_3args_t) ne10_setc_vec4f_neon;
/* Conformance part: every pair is run with loop = 0..TEST_ITERATION-1 as its
 * last argument. */
1625 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
1626 ne10_int32_t vec_size;
1628 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
/* Only the constant and the two destinations are allocated here. */
1631 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS);
1634 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1635 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1637 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1639 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Table pair func_loop works on vectors of func_loop + 1 floats. */
1641 vec_size = func_loop + 1;
1643 GUARD_ARRAY (thedst_c, loop * vec_size);
1644 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Scalar-constant call form (slots 0/1 of ftbl_3args_cst).
 * NOTE(review): confirm this form is selected only for func_loop == 0. */
1648 ftbl_3args_cst[2 * func_loop] (thedst_c, thecst[0], loop);
1649 ftbl_3args_cst[2 * func_loop + 1] (thedst_neon, thecst[0], loop);
/* Pointer-constant call form (slots 2..7 of ftbl_3args). */
1653 ftbl_3args[2 * func_loop] (thedst_c, thecst, loop);
1654 ftbl_3args[2 * func_loop + 1] (thedst_neon, thecst, loop);
1657 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
1658 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* For each of the loop positions: dump the constant, then pass the C and
 * NEON results to assert_float_vec_equal with ERROR_MARGIN_SMALL. */
1660 for (pos = 0; pos < loop; pos++)
1664 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1665 for (i = 0; i < vec_size; i++)
/* Only the constant is dumped: thesrc1 is never allocated by this test, so
 * it is either NULL or left over from another test and must not be read. */
1668 fprintf (stdout,
"thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
1671 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1676 free (guarded_dst_c);
1677 free (guarded_dst_neon);
/* Benchmark part: each variant is called PERF_TEST_ITERATION times on the
 * perftest_* buffers and one result row is logged per table pair. */
1680 #ifdef PERFORMANCE_TEST
1681 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
1682 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1684 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS);
1687 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1688 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* First pass covers only pair 0 (scalar-constant table); func_loop is 1 when
 * this loop exits and is reused as the start of the next loop. */
1690 for (func_loop = 0; func_loop < 1; func_loop++)
1693 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args_cst[2 * func_loop] (perftest_thedst_c, perftest_thecst[0], loop);
1695 GET_TIME (time_neon,
1696 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thecst[0], loop);
/* Speedup is the C/NEON time ratio; savings is the relative reduction in percent. */
1698 time_speedup = (ne10_float32_t) time_c / time_neon;
1699 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1700 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Remaining pairs (pointer-constant table), continuing from the current func_loop. */
1702 for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1705 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thecst, loop);
1707 GET_TIME (time_neon,
1708 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thecst, loop);
1710 time_speedup = (ne10_float32_t) time_c / time_neon;
1711 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1712 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1715 free (perftest_guarded_cst);
1716 free (perftest_guarded_dst_c);
1717 free (perftest_guarded_dst_neon);
1720 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/* The component-count macro is local to this test. */
1721 #undef MAX_VEC_COMPONENTS
/*
 * test_subc_case0: runs the ne10_subc_* "_c" and "_neon" routines on the same
 * source buffer and constant and hands both outputs to
 * assert_float_vec_equal; under PERFORMANCE_TEST it also times the two
 * variants. Takes no arguments and returns nothing.
 */
1724 void test_subc_case0()
1726 #define MAX_VEC_COMPONENTS 4
1728 ne10_int32_t func_loop;
1730 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Reset both tables, then register the pairs: even slot = C routine, odd
 * slot = NEON routine. The float pair goes into the _cst table, the
 * vec2f/vec3f/vec4f pairs into ftbl_4args. */
1733 memset (ftbl_4args, 0,
sizeof (ftbl_4args));
1734 memset (ftbl_4args_cst, 0,
sizeof (ftbl_4args_cst));
1735 ftbl_4args_cst[ 0] = (ne10_func_4args_cst_t) ne10_subc_float_c;
1736 ftbl_4args_cst[ 1] = (ne10_func_4args_cst_t) ne10_subc_float_neon;
1737 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_subc_vec2f_c;
1738 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_subc_vec2f_neon;
1739 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_subc_vec3f_c;
1740 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_subc_vec3f_neon;
1741 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_subc_vec4f_c;
1742 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_subc_vec4f_neon;
/* Conformance part: every pair is run with loop = 0..TEST_ITERATION-1 as its
 * last argument. */
1744 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
1745 ne10_int32_t vec_size;
1747 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
/* The source is sized for the widest vector; the constant holds
 * MAX_VEC_COMPONENTS floats. */
1750 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
1751 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS);
1754 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1755 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1757 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1759 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Table pair func_loop works on vectors of func_loop + 1 floats. */
1761 vec_size = func_loop + 1;
1763 GUARD_ARRAY (thedst_c, loop * vec_size);
1764 GUARD_ARRAY (thedst_neon, loop * vec_size);
/* Scalar-constant call form (slots 0/1 of ftbl_4args_cst).
 * NOTE(review): confirm this form is selected only for func_loop == 0. */
1768 ftbl_4args_cst[2 * func_loop] (thedst_c, thesrc1, thecst[0], loop);
1769 ftbl_4args_cst[2 * func_loop + 1] (thedst_neon, thesrc1, thecst[0], loop);
/* Pointer-constant call form (slots 2..7 of ftbl_4args). */
1773 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thecst, loop);
1774 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thecst, loop);
1777 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
1778 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* For each of the loop positions: dump the inputs, then pass the C and NEON
 * results to assert_float_vec_equal with ERROR_MARGIN_SMALL. */
1780 for (pos = 0; pos < loop; pos++)
1784 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1785 for (i = 0; i < vec_size; i++)
1787 fprintf (stdout,
"thesrc->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1788 fprintf (stdout,
"thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
1791 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
1795 free (guarded_src1);
1797 free (guarded_dst_c);
1798 free (guarded_dst_neon);
/* Benchmark part: each variant is called PERF_TEST_ITERATION times on the
 * perftest_* buffers and one result row is logged per table pair. */
1801 #ifdef PERFORMANCE_TEST
1802 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
1803 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1805 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
1806 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS);
1809 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1810 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
/* First pass covers only pair 0 (scalar-constant table); func_loop is 1 when
 * this loop exits and is reused as the start of the next loop. */
1812 for (func_loop = 0; func_loop < 1; func_loop++)
1815 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst[0], loop);
1817 GET_TIME (time_neon,
1818 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args_cst[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst[0], loop);
/* Speedup is the C/NEON time ratio; savings is the relative reduction in percent. */
1820 time_speedup = (ne10_float32_t) time_c / time_neon;
1821 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1822 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
/* Remaining pairs (pointer-constant table), continuing from the current func_loop. */
1824 for (; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1827 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thecst, loop);
1829 GET_TIME (time_neon,
1830 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thecst, loop);
1832 time_speedup = (ne10_float32_t) time_c / time_neon;
1833 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1834 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1837 free (perftest_guarded_src1);
1838 free (perftest_guarded_cst);
1839 free (perftest_guarded_dst_c);
1840 free (perftest_guarded_dst_neon);
1843 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
/* The component-count macro is local to this test. */
1844 #undef MAX_VEC_COMPONENTS
/*
 * Test case for the element-wise subtraction routines (ne10_sub_*):
 * runs the C and NEON variants on the same inputs, checks that their
 * outputs agree, and (under PERFORMANCE_TEST) reports relative timing.
 */
1847 void test_sub_case0()
1849 #define MAX_VEC_COMPONENTS 4
1851 ne10_int32_t func_loop;
1853 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Function table: slot 2*k is the C variant, slot 2*k+1 the NEON variant,
 * where k = func_loop and k+1 is the number of float components. */
1856 memset (ftbl_4args, 0,
sizeof (ftbl_4args));
1857 ftbl_4args[ 0] = (ne10_func_4args_t) ne10_sub_float_c;
1858 ftbl_4args[ 1] = (ne10_func_4args_t) ne10_sub_float_neon;
1859 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_sub_vec2f_c;
1860 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_sub_vec2f_neon;
1861 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_sub_vec3f_c;
1862 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_sub_vec3f_neon;
1863 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_sub_vec4f_c;
1864 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_sub_vec4f_neon;
/* ---- Conformance section: C output must match NEON output ---- */
1866 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
1867 ne10_int32_t vec_size;
/* Buffers are sized for the widest vector at the largest iteration count. */
1869 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
/* The allocation macros are defined elsewhere; presumably they allocate the
 * guarded_* block and point the working pointer inside it -- TODO confirm. */
1872 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
1873 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
1876 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1877 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
/* Outer loop: vector width; inner loop: `loop` is passed as the last
 * argument of each routine (presumably the element count). */
1879 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1881 for (loop = 0; loop < TEST_ITERATION; loop++)
1883 vec_size = func_loop + 1;
/* Mark the region the routines are allowed to write: loop * vec_size floats. */
1885 GUARD_ARRAY (thedst_c, loop * vec_size);
1886 GUARD_ARRAY (thedst_neon, loop * vec_size);
1888 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
1889 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
/* Verify that neither variant wrote outside the marked region. */
1891 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
1892 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* Compare the two outputs one vector at a time. */
1894 for (pos = 0; pos < loop; pos++)
/* Trace output: position plus both source vectors (value and raw bits). */
1898 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1899 for (i = 0; i < vec_size; i++)
1901 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
1902 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
1905 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
/* Release via the guarded_* pointers, not the working pointers. */
1909 free (guarded_src1);
1910 free (guarded_src2);
1911 free (guarded_dst_c);
1912 free (guarded_dst_neon);
/* ---- Performance section: time C vs. NEON for each vector width ---- */
1915 #ifdef PERFORMANCE_TEST
1916 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
1917 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
1919 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
1920 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
1923 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
1924 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
1926 for (func_loop = 0; func_loop < MAX_VEC_COMPONENTS; func_loop++)
/* C variant workload: PERF_TEST_ITERATION calls with a growing last argument. */
1929 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
/* Same workload for the NEON variant, measured into time_neon. */
1931 GET_TIME (time_neon,
1932 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
/* speedup = C time / NEON time; savings = share of the C time saved, in percent. */
1934 time_speedup = (ne10_float32_t) time_c / time_neon;
1935 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
1936 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
1939 free (perftest_guarded_src1);
1940 free (perftest_guarded_src2);
1941 free (perftest_guarded_dst_c);
1942 free (perftest_guarded_dst_neon);
1945 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
1946 #undef MAX_VEC_COMPONENTS
/*
 * Test case for the matrix addition routines (ne10_addmat_NxNf):
 * runs the C and NEON variants on the same inputs, checks that their
 * outputs agree, and (under PERFORMANCE_TEST) reports relative timing.
 */
1949 void test_addmat_case0()
1951 #define MAX_VEC_COMPONENTS 4
1953 ne10_int32_t func_loop;
1955 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Function table: slot 2*k is the C variant, slot 2*k+1 the NEON variant for
 * (k+1)x(k+1) matrices. Slots 0 and 1 stay zeroed, so all loops start at k = 1. */
1958 memset (ftbl_4args, 0,
sizeof (ftbl_4args));
1959 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_addmat_2x2f_c;
1960 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_addmat_2x2f_neon;
1961 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_addmat_3x3f_c;
1962 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_addmat_3x3f_neon;
1963 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_addmat_4x4f_c;
1964 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_addmat_4x4f_neon;
/* ---- Conformance section: C output must match NEON output ---- */
1966 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
1967 ne10_int32_t vec_size;
/* Buffers are sized for the largest (4x4) matrix at the largest iteration count. */
1969 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
/* The allocation macros are defined elsewhere; presumably they allocate the
 * guarded_* block and point the working pointer inside it -- TODO confirm. */
1972 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
1973 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
1976 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
1977 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
1979 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
1981 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Floats per matrix: dimension squared. */
1983 vec_size = (func_loop + 1) * (func_loop + 1);
/* Mark the region the routines are allowed to write: loop * vec_size floats. */
1985 GUARD_ARRAY (thedst_c, loop * vec_size);
1986 GUARD_ARRAY (thedst_neon, loop * vec_size);
1988 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
1989 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
/* Verify that neither variant wrote outside the marked region. */
1991 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
1992 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* Compare the two outputs one matrix at a time. */
1994 for (pos = 0; pos < loop; pos++)
/* Trace output: position plus both source matrices (value and raw bits). */
1998 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
1999 for (i = 0; i < vec_size; i++)
2001 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2002 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
2005 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
/* Release via the guarded_* pointers, not the working pointers. */
2009 free (guarded_src1);
2010 free (guarded_src2);
2011 free (guarded_dst_c);
2012 free (guarded_dst_neon);
/* ---- Performance section: time C vs. NEON for each matrix size ---- */
2015 #ifdef PERFORMANCE_TEST
2016 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
2017 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2019 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
2020 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
2023 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2024 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2026 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
/* C variant workload: PERF_TEST_ITERATION calls with a growing last argument. */
2029 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
/* Same workload for the NEON variant, measured into time_neon. */
2031 GET_TIME (time_neon,
2032 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
/* speedup = C time / NEON time; savings = share of the C time saved, in percent. */
2034 time_speedup = (ne10_float32_t) time_c / time_neon;
2035 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2036 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2039 free (perftest_guarded_src1);
2040 free (perftest_guarded_src2);
2041 free (perftest_guarded_dst_c);
2042 free (perftest_guarded_dst_neon);
2045 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
2046 #undef MAX_VEC_COMPONENTS
/*
 * Test case for the matrix determinant routines (ne10_detmat_NxNf):
 * runs the C and NEON variants on the same input matrices, checks that
 * the resulting determinants agree, and (under PERFORMANCE_TEST) reports
 * relative timing.
 *
 * Unlike the other matrix tests, the destination holds ONE float per
 * source matrix, so the destination is guarded and indexed by `loop`/`pos`
 * rather than by `loop * vec_size` / `pos * vec_size`.
 */
2049 void test_detmat_case0()
2051 #define MAX_VEC_COMPONENTS 4
2053 ne10_int32_t func_loop;
2055 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Function table: slot 2*k is the C variant, slot 2*k+1 the NEON variant for
 * (k+1)x(k+1) matrices. Slots 0 and 1 stay zeroed, so all loops start at k = 1. */
2058 memset (ftbl_3args, 0,
sizeof (ftbl_3args));
2059 ftbl_3args[ 2] = (ne10_func_3args_t) ne10_detmat_2x2f_c;
2060 ftbl_3args[ 3] = (ne10_func_3args_t) ne10_detmat_2x2f_neon;
2061 ftbl_3args[ 4] = (ne10_func_3args_t) ne10_detmat_3x3f_c;
2062 ftbl_3args[ 5] = (ne10_func_3args_t) ne10_detmat_3x3f_neon;
2063 ftbl_3args[ 6] = (ne10_func_3args_t) ne10_detmat_4x4f_c;
2064 ftbl_3args[ 7] = (ne10_func_3args_t) ne10_detmat_4x4f_neon;
/* ---- Conformance section: C output must match NEON output ---- */
2066 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
2067 ne10_int32_t vec_size;
/* Buffers are sized for the largest (4x4) matrix at the largest iteration count. */
2069 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
/* The allocation macros are defined elsewhere; presumably they allocate the
 * guarded_* block and point the working pointer inside it -- TODO confirm. */
2072 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
2075 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2076 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2078 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2080 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Floats per SOURCE matrix: dimension squared. */
2082 vec_size = (func_loop + 1) * (func_loop + 1);
/* Destination region is `loop` floats: one determinant per matrix. */
2084 GUARD_ARRAY (thedst_c, loop);
2085 GUARD_ARRAY (thedst_neon, loop);
2087 ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
2088 ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
/* Verify that neither variant wrote outside the marked region. */
2090 CHECK_ARRAY_GUARD (thedst_c, loop);
2091 CHECK_ARRAY_GUARD (thedst_neon, loop);
/* Compare the determinants one matrix at a time. */
2093 for (pos = 0; pos < loop; pos++)
/* Trace output: position plus the source matrix (value and raw bits). */
2097 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2098 for (i = 0; i < vec_size; i++)
2100 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
/* Results are packed one float per matrix, so index the destination by
 * `pos`. Indexing by pos * vec_size compared memory beyond the `loop`
 * results written above and left most determinants unchecked. */
2103 assert_float_vec_equal (&thedst_c[pos], &thedst_neon[pos], ERROR_MARGIN_SMALL, 1);
/* Release via the guarded_* pointers, not the working pointers. */
2107 free (guarded_src1);
2108 free (guarded_dst_c);
2109 free (guarded_dst_neon);
/* ---- Performance section: time C vs. NEON for each matrix size ---- */
2112 #ifdef PERFORMANCE_TEST
2113 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
2114 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2116 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
2119 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2120 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2122 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
/* C variant workload: PERF_TEST_ITERATION calls with a growing last argument. */
2125 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
/* Same workload for the NEON variant, measured into time_neon. */
2127 GET_TIME (time_neon,
2128 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
/* speedup = C time / NEON time; savings = share of the C time saved, in percent. */
2130 time_speedup = (ne10_float32_t) time_c / time_neon;
2131 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2132 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2135 free (perftest_guarded_src1);
2136 free (perftest_guarded_dst_c);
2137 free (perftest_guarded_dst_neon);
2140 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
2141 #undef MAX_VEC_COMPONENTS
/*
 * Test case for the identity-matrix fill routines (ne10_identitymat_NxNf):
 * runs the C and NEON variants, checks that their outputs agree, and
 * (under PERFORMANCE_TEST) reports relative timing. These routines take
 * only a destination and a count, so no source buffers are allocated.
 */
2144 void test_identitymat_case0()
2146 #define MAX_VEC_COMPONENTS 4
2148 ne10_int32_t func_loop;
2150 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Function table: slot 2*k is the C variant, slot 2*k+1 the NEON variant for
 * (k+1)x(k+1) matrices. Slots 0 and 1 stay zeroed, so all loops start at k = 1. */
2153 memset (ftbl_2args, 0,
sizeof (ftbl_2args));
2154 ftbl_2args[ 2] = (ne10_func_2args_t) ne10_identitymat_2x2f_c;
2155 ftbl_2args[ 3] = (ne10_func_2args_t) ne10_identitymat_2x2f_neon;
2156 ftbl_2args[ 4] = (ne10_func_2args_t) ne10_identitymat_3x3f_c;
2157 ftbl_2args[ 5] = (ne10_func_2args_t) ne10_identitymat_3x3f_neon;
2158 ftbl_2args[ 6] = (ne10_func_2args_t) ne10_identitymat_4x4f_c;
2159 ftbl_2args[ 7] = (ne10_func_2args_t) ne10_identitymat_4x4f_neon;
/* ---- Conformance section: C output must match NEON output ---- */
2161 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
2162 ne10_int32_t vec_size;
/* Buffers are sized for the largest (4x4) matrix at the largest iteration count. */
2164 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
/* NE10_DST_ALLOC is defined elsewhere; presumably it allocates the guarded_*
 * block and points the working pointer inside it -- TODO confirm. */
2167 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2168 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2170 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2172 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Floats per matrix: dimension squared. */
2174 vec_size = (func_loop + 1) * (func_loop + 1);
/* Mark the region the routines are allowed to write: loop * vec_size floats. */
2176 GUARD_ARRAY (thedst_c, loop * vec_size);
2177 GUARD_ARRAY (thedst_neon, loop * vec_size);
2179 ftbl_2args[2 * func_loop] (thedst_c, loop);
2180 ftbl_2args[2 * func_loop + 1] (thedst_neon, loop);
/* Verify that neither variant wrote outside the marked region. */
2182 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
2183 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* Compare the two outputs one matrix at a time. */
2185 for (pos = 0; pos < loop; pos++)
2188 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2190 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
/* Release via the guarded_* pointers, not the working pointers. */
2194 free (guarded_dst_c);
2195 free (guarded_dst_neon);
/* ---- Performance section: time C vs. NEON for each matrix size ---- */
2198 #ifdef PERFORMANCE_TEST
2199 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
2200 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2202 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2203 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2205 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
/* C variant workload: PERF_TEST_ITERATION calls with a growing last argument. */
2208 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_2args[2 * func_loop] (perftest_thedst_c, loop);
/* Same workload for the NEON variant, measured into time_neon. */
2210 GET_TIME (time_neon,
2211 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_2args[2 * func_loop + 1] (perftest_thedst_neon, loop);
/* speedup = C time / NEON time; savings = share of the C time saved, in percent. */
2213 time_speedup = (ne10_float32_t) time_c / time_neon;
2214 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2215 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2218 free (perftest_guarded_dst_c);
2219 free (perftest_guarded_dst_neon);
2222 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
2223 #undef MAX_VEC_COMPONENTS
/*
 * Test case for the matrix inversion routines (ne10_invmat_NxNf):
 * runs the C and NEON variants on the same inputs, checks that their
 * outputs agree, and (under PERFORMANCE_TEST) reports relative timing.
 * This test compares with ERROR_MARGIN_LARGE rather than ERROR_MARGIN_SMALL.
 */
2226 void test_invmat_case0()
2228 #define MAX_VEC_COMPONENTS 4
2230 ne10_int32_t func_loop;
2232 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Function table: slot 2*k is the C variant, slot 2*k+1 the NEON variant for
 * (k+1)x(k+1) matrices. Slots 0 and 1 stay zeroed, so all loops start at k = 1. */
2235 memset (ftbl_3args, 0,
sizeof (ftbl_3args));
2236 ftbl_3args[ 2] = (ne10_func_3args_t) ne10_invmat_2x2f_c;
2237 ftbl_3args[ 3] = (ne10_func_3args_t) ne10_invmat_2x2f_neon;
2238 ftbl_3args[ 4] = (ne10_func_3args_t) ne10_invmat_3x3f_c;
2239 ftbl_3args[ 5] = (ne10_func_3args_t) ne10_invmat_3x3f_neon;
2240 ftbl_3args[ 6] = (ne10_func_3args_t) ne10_invmat_4x4f_c;
2241 ftbl_3args[ 7] = (ne10_func_3args_t) ne10_invmat_4x4f_neon;
/* ---- Conformance section: C output must match NEON output ---- */
2243 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
2244 ne10_int32_t vec_size;
/* Buffers are sized for the largest (4x4) matrix at the largest iteration count. */
2246 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
/* The allocation macros are defined elsewhere; presumably they allocate the
 * guarded_* block and point the working pointer inside it -- TODO confirm. */
2249 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
2252 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2253 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2255 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2257 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Floats per matrix: dimension squared. */
2259 vec_size = (func_loop + 1) * (func_loop + 1);
/* Mark the region the routines are allowed to write: loop * vec_size floats. */
2261 GUARD_ARRAY (thedst_c, loop * vec_size);
2262 GUARD_ARRAY (thedst_neon, loop * vec_size);
2264 ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
2265 ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
/* Verify that neither variant wrote outside the marked region. */
2267 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
2268 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* Compare the two outputs one matrix at a time. */
2270 for (pos = 0; pos < loop; pos++)
/* Trace output: position plus the source matrix (value and raw bits). */
2274 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2275 for (i = 0; i < vec_size; i++)
2277 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2280 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_LARGE, vec_size);
/* Release via the guarded_* pointers, not the working pointers. */
2284 free (guarded_src1);
2285 free (guarded_dst_c);
2286 free (guarded_dst_neon);
/* ---- Performance section: time C vs. NEON for each matrix size ---- */
2289 #ifdef PERFORMANCE_TEST
2290 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
2291 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2293 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
2296 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2297 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2299 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
/* C variant workload: PERF_TEST_ITERATION calls with a growing last argument. */
2302 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
/* Same workload for the NEON variant, measured into time_neon. */
2304 GET_TIME (time_neon,
2305 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
/* speedup = C time / NEON time; savings = share of the C time saved, in percent. */
2307 time_speedup = (ne10_float32_t) time_c / time_neon;
2308 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2309 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2312 free (perftest_guarded_src1);
2313 free (perftest_guarded_dst_c);
2314 free (perftest_guarded_dst_neon);
2317 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
2318 #undef MAX_VEC_COMPONENTS
/*
 * Test case for the matrix multiplication routines (ne10_mulmat_NxNf):
 * runs the C and NEON variants on the same inputs, checks that their
 * outputs agree, and (under PERFORMANCE_TEST) reports relative timing.
 */
2321 void test_mulmat_case0()
2323 #define MAX_VEC_COMPONENTS 4
2325 ne10_int32_t func_loop;
2327 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Function table: slot 2*k is the C variant, slot 2*k+1 the NEON variant for
 * (k+1)x(k+1) matrices. Slots 0 and 1 stay zeroed, so all loops start at k = 1. */
2330 memset (ftbl_4args, 0,
sizeof (ftbl_4args));
2331 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_mulmat_2x2f_c;
2332 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_mulmat_2x2f_neon;
2333 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_mulmat_3x3f_c;
2334 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_mulmat_3x3f_neon;
2335 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_mulmat_4x4f_c;
2336 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_mulmat_4x4f_neon;
/* ---- Conformance section: C output must match NEON output ---- */
2338 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
2339 ne10_int32_t vec_size;
/* Buffers are sized for the largest (4x4) matrix at the largest iteration count. */
2341 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
/* The allocation macros are defined elsewhere; presumably they allocate the
 * guarded_* block and point the working pointer inside it -- TODO confirm. */
2344 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
2345 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
2348 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2349 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2351 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2353 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Floats per matrix: dimension squared. */
2355 vec_size = (func_loop + 1) * (func_loop + 1);
/* Mark the region the routines are allowed to write: loop * vec_size floats. */
2357 GUARD_ARRAY (thedst_c, loop * vec_size);
2358 GUARD_ARRAY (thedst_neon, loop * vec_size);
2360 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
2361 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
/* Verify that neither variant wrote outside the marked region. */
2363 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
2364 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* Compare the two outputs one matrix at a time. */
2366 for (pos = 0; pos < loop; pos++)
/* Trace output: position plus both source matrices (value and raw bits). */
2370 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2371 for (i = 0; i < vec_size; i++)
2373 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2374 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
2377 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
/* Release via the guarded_* pointers, not the working pointers. */
2381 free (guarded_src1);
2382 free (guarded_src2);
2383 free (guarded_dst_c);
2384 free (guarded_dst_neon);
/* ---- Performance section: time C vs. NEON for each matrix size ---- */
2387 #ifdef PERFORMANCE_TEST
2388 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
2389 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2391 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
2392 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
2395 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2396 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2398 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
/* C variant workload: PERF_TEST_ITERATION calls with a growing last argument. */
2401 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
/* Same workload for the NEON variant, measured into time_neon. */
2403 GET_TIME (time_neon,
2404 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
/* speedup = C time / NEON time; savings = share of the C time saved, in percent. */
2406 time_speedup = (ne10_float32_t) time_c / time_neon;
2407 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2408 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2411 free (perftest_guarded_src1);
2412 free (perftest_guarded_src2);
2413 free (perftest_guarded_dst_c);
2414 free (perftest_guarded_dst_neon);
2417 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
2418 #undef MAX_VEC_COMPONENTS
/*
 * Test case for the matrix subtraction routines (ne10_submat_NxNf):
 * runs the C and NEON variants on the same inputs, checks that their
 * outputs agree, and (under PERFORMANCE_TEST) reports relative timing.
 */
2421 void test_submat_case0()
2423 #define MAX_VEC_COMPONENTS 4
2425 ne10_int32_t func_loop;
2427 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Function table: slot 2*k is the C variant, slot 2*k+1 the NEON variant for
 * (k+1)x(k+1) matrices. Slots 0 and 1 stay zeroed, so all loops start at k = 1. */
2430 memset (ftbl_4args, 0,
sizeof (ftbl_4args));
2431 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_submat_2x2f_c;
2432 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_submat_2x2f_neon;
2433 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_submat_3x3f_c;
2434 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_submat_3x3f_neon;
2435 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_submat_4x4f_c;
2436 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_submat_4x4f_neon;
/* ---- Conformance section: C output must match NEON output ---- */
2438 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
2439 ne10_int32_t vec_size;
/* Buffers are sized for the largest (4x4) matrix at the largest iteration count. */
2441 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
/* The allocation macros are defined elsewhere; presumably they allocate the
 * guarded_* block and point the working pointer inside it -- TODO confirm. */
2444 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
2445 NE10_SRC_ALLOC_LIMIT (thesrc2, guarded_src2, fixed_length);
2448 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2449 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2451 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2453 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Floats per matrix: dimension squared. */
2455 vec_size = (func_loop + 1) * (func_loop + 1);
/* Mark the region the routines are allowed to write: loop * vec_size floats. */
2457 GUARD_ARRAY (thedst_c, loop * vec_size);
2458 GUARD_ARRAY (thedst_neon, loop * vec_size);
2460 ftbl_4args[2 * func_loop] (thedst_c, thesrc1, thesrc2, loop);
2461 ftbl_4args[2 * func_loop + 1] (thedst_neon, thesrc1, thesrc2, loop);
/* Verify that neither variant wrote outside the marked region. */
2463 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
2464 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* Compare the two outputs one matrix at a time. */
2466 for (pos = 0; pos < loop; pos++)
/* Trace output: position plus both source matrices (value and raw bits). */
2470 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2471 for (i = 0; i < vec_size; i++)
2473 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2474 fprintf (stdout,
"thesrc2->%d: %e [0x%04X] \n", i, thesrc2[pos * vec_size + i], * (ne10_uint32_t*) &thesrc2[pos * vec_size + i]);
2477 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
/* Release via the guarded_* pointers, not the working pointers. */
2481 free (guarded_src1);
2482 free (guarded_src2);
2483 free (guarded_dst_c);
2484 free (guarded_dst_neon);
/* ---- Performance section: time C vs. NEON for each matrix size ---- */
2487 #ifdef PERFORMANCE_TEST
2488 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
2489 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2491 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
2492 NE10_SRC_ALLOC_LIMIT (perftest_thesrc2, perftest_guarded_src2, perftest_length);
2495 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2496 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2498 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
/* C variant workload: PERF_TEST_ITERATION calls with a growing last argument. */
2501 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, perftest_thesrc2, loop);
/* Same workload for the NEON variant, measured into time_neon. */
2503 GET_TIME (time_neon,
2504 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, perftest_thesrc2, loop);
/* speedup = C time / NEON time; savings = share of the C time saved, in percent. */
2506 time_speedup = (ne10_float32_t) time_c / time_neon;
2507 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2508 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2511 free (perftest_guarded_src1);
2512 free (perftest_guarded_src2);
2513 free (perftest_guarded_dst_c);
2514 free (perftest_guarded_dst_neon);
2517 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
2518 #undef MAX_VEC_COMPONENTS
/*
 * Test case for the matrix transpose routines (ne10_transmat_NxNf):
 * runs the C and NEON variants on the same inputs, checks that their
 * outputs agree, and (under PERFORMANCE_TEST) reports relative timing.
 */
2521 void test_transmat_case0()
2523 #define MAX_VEC_COMPONENTS 4
2525 ne10_int32_t func_loop;
2527 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Function table: slot 2*k is the C variant, slot 2*k+1 the NEON variant for
 * (k+1)x(k+1) matrices. Slots 0 and 1 stay zeroed, so all loops start at k = 1. */
2530 memset (ftbl_3args, 0,
sizeof (ftbl_3args));
2531 ftbl_3args[ 2] = (ne10_func_3args_t) ne10_transmat_2x2f_c;
2532 ftbl_3args[ 3] = (ne10_func_3args_t) ne10_transmat_2x2f_neon;
2533 ftbl_3args[ 4] = (ne10_func_3args_t) ne10_transmat_3x3f_c;
2534 ftbl_3args[ 5] = (ne10_func_3args_t) ne10_transmat_3x3f_neon;
2535 ftbl_3args[ 6] = (ne10_func_3args_t) ne10_transmat_4x4f_c;
2536 ftbl_3args[ 7] = (ne10_func_3args_t) ne10_transmat_4x4f_neon;
/* ---- Conformance section: C output must match NEON output ---- */
2538 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
2539 ne10_int32_t vec_size;
/* Buffers are sized for the largest (4x4) matrix at the largest iteration count. */
2541 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
/* The allocation macros are defined elsewhere; presumably they allocate the
 * guarded_* block and point the working pointer inside it -- TODO confirm. */
2544 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
2547 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2548 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2550 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2552 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Floats per matrix: dimension squared. */
2554 vec_size = (func_loop + 1) * (func_loop + 1);
/* Mark the region the routines are allowed to write: loop * vec_size floats. */
2556 GUARD_ARRAY (thedst_c, loop * vec_size);
2557 GUARD_ARRAY (thedst_neon, loop * vec_size);
2559 ftbl_3args[2 * func_loop] (thedst_c, thesrc1, loop);
2560 ftbl_3args[2 * func_loop + 1] (thedst_neon, thesrc1, loop);
/* Verify that neither variant wrote outside the marked region. */
2562 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
2563 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* Compare the two outputs one matrix at a time. */
2565 for (pos = 0; pos < loop; pos++)
/* Trace output: position plus the source matrix (value and raw bits). */
2569 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2570 for (i = 0; i < vec_size; i++)
2572 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2575 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
/* Release via the guarded_* pointers, not the working pointers. */
2579 free (guarded_src1);
2580 free (guarded_dst_c);
2581 free (guarded_dst_neon);
/* ---- Performance section: time C vs. NEON for each matrix size ---- */
2584 #ifdef PERFORMANCE_TEST
2585 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
2586 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS;
2588 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
2591 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2592 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2594 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
/* C variant workload: PERF_TEST_ITERATION calls with a growing last argument. */
2597 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop] (perftest_thedst_c, perftest_thesrc1, loop);
/* Same workload for the NEON variant, measured into time_neon. */
2599 GET_TIME (time_neon,
2600 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_3args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thesrc1, loop);
/* speedup = C time / NEON time; savings = share of the C time saved, in percent. */
2602 time_speedup = (ne10_float32_t) time_c / time_neon;
2603 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2604 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2607 free (perftest_guarded_src1);
2608 free (perftest_guarded_dst_c);
2609 free (perftest_guarded_dst_neon);
2612 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
2613 #undef MAX_VEC_COMPONENTS
/*
 * Test case for the constant-matrix times vector routines
 * (ne10_mulcmatvec_cmNxNf_vNf): runs the C and NEON variants on the same
 * inputs, checks that their outputs agree, and (under PERFORMANCE_TEST)
 * reports relative timing. One matrix (thecst) is applied to every vector
 * in thesrc1; note the argument order (dst, matrix, vectors, count).
 */
2616 void test_mulcmatvec_case0()
2618 #define MAX_VEC_COMPONENTS 4
2620 ne10_int32_t func_loop;
2622 fprintf (stdout,
"----------%30s start\n", __FUNCTION__);
/* Function table: slot 2*k is the C variant, slot 2*k+1 the NEON variant for
 * (k+1)-component vectors. Slots 0 and 1 stay zeroed, so all loops start at k = 1. */
2625 memset (ftbl_4args, 0,
sizeof (ftbl_4args));
2626 ftbl_4args[ 2] = (ne10_func_4args_t) ne10_mulcmatvec_cm2x2f_v2f_c;
2627 ftbl_4args[ 3] = (ne10_func_4args_t) ne10_mulcmatvec_cm2x2f_v2f_neon;
2628 ftbl_4args[ 4] = (ne10_func_4args_t) ne10_mulcmatvec_cm3x3f_v3f_c;
2629 ftbl_4args[ 5] = (ne10_func_4args_t) ne10_mulcmatvec_cm3x3f_v3f_neon;
2630 ftbl_4args[ 6] = (ne10_func_4args_t) ne10_mulcmatvec_cm4x4f_v4f_c;
2631 ftbl_4args[ 7] = (ne10_func_4args_t) ne10_mulcmatvec_cm4x4f_v4f_neon;
/* ---- Conformance section: C output must match NEON output ---- */
2633 #if defined (SMOKE_TEST)||(REGRESSION_TEST)
2634 ne10_int32_t vec_size;
/* Vector buffers are sized for the widest vector at the largest iteration count. */
2636 const ne10_uint32_t fixed_length = TEST_ITERATION * MAX_VEC_COMPONENTS;
/* The allocation macros are defined elsewhere; presumably they allocate the
 * guarded_* block and point the working pointer inside it -- TODO confirm.
 * thecst is sized for a single 4x4 matrix. */
2639 NE10_SRC_ALLOC_LIMIT (thesrc1, guarded_src1, fixed_length);
2640 NE10_SRC_ALLOC_LIMIT (thecst, guarded_cst, MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS);
2643 NE10_DST_ALLOC (thedst_c, guarded_dst_c, fixed_length);
2644 NE10_DST_ALLOC (thedst_neon, guarded_dst_neon, fixed_length);
2646 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
2648 for (loop = 0; loop < TEST_ITERATION; loop++)
/* Floats per vector (the matrix has vec_size * vec_size floats). */
2650 vec_size = func_loop + 1;
/* Mark the region the routines are allowed to write: loop * vec_size floats. */
2652 GUARD_ARRAY (thedst_c, loop * vec_size);
2653 GUARD_ARRAY (thedst_neon, loop * vec_size);
2655 ftbl_4args[2 * func_loop] (thedst_c, thecst, thesrc1, loop);
2656 ftbl_4args[2 * func_loop + 1] (thedst_neon, thecst, thesrc1, loop);
/* Verify that neither variant wrote outside the marked region. */
2658 CHECK_ARRAY_GUARD (thedst_c, loop * vec_size);
2659 CHECK_ARRAY_GUARD (thedst_neon, loop * vec_size);
/* Compare the two outputs one vector at a time. */
2661 for (pos = 0; pos < loop; pos++)
/* Trace output: position, the whole constant matrix, then the source vector. */
2665 fprintf (stdout,
"func: %d loop count: %d position: %d \n", func_loop, loop, pos);
2666 for (i = 0; i < vec_size * vec_size; i++)
2668 fprintf (stdout,
"thecst->%d: %e [0x%04X] \n", i, thecst[i], * (ne10_uint32_t*) &thecst[i]);
2670 for (i = 0; i < vec_size; i++)
2672 fprintf (stdout,
"thesrc1->%d: %e [0x%04X] \n", i, thesrc1[pos * vec_size + i], * (ne10_uint32_t*) &thesrc1[pos * vec_size + i]);
2675 assert_float_vec_equal (&thedst_c[pos * vec_size], &thedst_neon[pos * vec_size], ERROR_MARGIN_SMALL, vec_size);
/* Release via the guarded_* pointers, not the working pointers. */
2679 free (guarded_src1);
2681 free (guarded_dst_c);
2682 free (guarded_dst_neon);
/* ---- Performance section: time C vs. NEON for each vector width ---- */
2685 #ifdef PERFORMANCE_TEST
2686 fprintf (stdout,
"%25s%20s%20s%20s%20s\n",
"N-component Vector",
"C Time in ms",
"NEON Time in ms",
"Time Savings",
"Performance Ratio");
2687 perftest_length = PERF_TEST_ITERATION * MAX_VEC_COMPONENTS;
2689 NE10_SRC_ALLOC_LIMIT (perftest_thesrc1, perftest_guarded_src1, perftest_length);
2690 NE10_SRC_ALLOC_LIMIT (perftest_thecst, perftest_guarded_cst, MAX_VEC_COMPONENTS * MAX_VEC_COMPONENTS);
2693 NE10_DST_ALLOC (perftest_thedst_c, perftest_guarded_dst_c, perftest_length);
2694 NE10_DST_ALLOC (perftest_thedst_neon, perftest_guarded_dst_neon, perftest_length);
2696 for (func_loop = 1; func_loop < MAX_VEC_COMPONENTS; func_loop++)
/* C variant workload: PERF_TEST_ITERATION calls with a growing last argument. */
2699 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop] (perftest_thedst_c, perftest_thecst, perftest_thesrc1, loop);
/* Same workload for the NEON variant, measured into time_neon. */
2701 GET_TIME (time_neon,
2702 for (loop = 0; loop < PERF_TEST_ITERATION; loop++) ftbl_4args[2 * func_loop + 1] (perftest_thedst_neon, perftest_thecst, perftest_thesrc1, loop);
/* speedup = C time / NEON time; savings = share of the C time saved, in percent. */
2704 time_speedup = (ne10_float32_t) time_c / time_neon;
2705 time_savings = ( ( (ne10_float32_t) (time_c - time_neon)) / time_c) * 100;
2706 ne10_log (__FUNCTION__,
"%25d%20lld%20lld%19.2f%%%18.2f:1\n", func_loop + 1, time_c, time_neon, time_savings, time_speedup);
2709 free (perftest_guarded_src1);
2710 free (perftest_guarded_cst);
2711 free (perftest_guarded_dst_c);
2712 free (perftest_guarded_dst_neon);
2715 fprintf (stdout,
"----------%30s end\n", __FUNCTION__);
2716 #undef MAX_VEC_COMPONENTS
/*
 * Thin test entry points: each one forwards to the matching *_case0
 * routine. These are the names passed to run_test() in test_fixture_math.
 */
2778 void test_normalize()
2780 test_normalize_case0();
2805 test_addmat_case0();
2810 test_detmat_case0();
/* Entry point for the identity-matrix test. */
2813 void test_identitymat()
2815 test_identitymat_case0();
2820 test_invmat_case0();
2825 test_mulmat_case0();
/* Entry point for the constant-matrix times vector test. */
2828 void test_mulcmatvec()
2830 test_mulcmatvec_case0();
2835 test_submat_case0();
/* Entry point for the matrix transpose test. */
2838 void test_transmat()
2840 test_transmat_case0();
/*
 * Setup hook registered through fixture_setup() in test_fixture_math:
 * points ne10_log_buffer_ptr back at ne10_log_buffer.
 */
2843 static void my_test_setup (
void)
2846 ne10_log_buffer_ptr = ne10_log_buffer;
/* Teardown hook registered through fixture_teardown() in test_fixture_math. */
2849 void my_test_teardown (
void)
/*
 * Math test fixture: starts the fixture, installs the setup and teardown
 * hooks, then runs every math test entry point in the order listed.
 */
2854 void test_fixture_math (
void)
2856 test_fixture_start();
2858 fixture_setup (my_test_setup);
2859 fixture_teardown (my_test_teardown);
/* Vector routines first, then the matrix routines. */
2861 run_test (test_abs);
2862 run_test (test_addc);
2863 run_test (test_add);
2864 run_test (test_cross);
2865 run_test (test_divc);
2866 run_test (test_div);
2867 run_test (test_dot);
2868 run_test (test_len);
2869 run_test (test_mlac);
2870 run_test (test_mla);
2871 run_test (test_mulc);
2872 run_test (test_mul);
2873 run_test (test_normalize);
2874 run_test (test_rsbc);
2875 run_test (test_setc);
2876 run_test (test_subc);
2877 run_test (test_sub);
2878 run_test (test_addmat);
2879 run_test (test_detmat);
2880 run_test (test_identitymat);
2881 run_test (test_invmat);
2882 run_test (test_mulmat);
2883 run_test (test_mulcmatvec);
2884 run_test (test_submat);
2885 run_test (test_transmat);