36 #error "These intrinsics should only be used in C++ code"
39 #include <yepPredefines.h>
43 #if defined(YEP_MSVC_COMPATIBLE_COMPILER)
46 #if defined(YEP_GCC_COMPATIBLE_COMPILER) && defined(YEP_X86_CPU)
47 #include <x86intrin.h>
51 #if defined(YEP_MSVC_COMPATIBLE_COMPILER)
53 #elif defined(YEP_GCC_COMPATIBLE_COMPILER) || defined(YEP_NVIDIA_COMPILER)
56 #error "Unsupported compiler"
61 #if defined(YEP_MSVC_COMPATIBLE_COMPILER)
63 #elif defined(YEP_GNU_COMPILER)
65 #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5))
66 __builtin_unreachable();
70 #elif defined(YEP_CLANG_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_UNIX)
71 __builtin_unreachable();
72 #elif defined(YEP_NVIDIA_COMPILER)
75 #error "Unsupported compiler"
80 return YepSize(pointer) % alignment;
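/* Note: this helper appears to report how far a pointer is from the requested
 * alignment boundary by reinterpreting it as an integer (YepSize) and taking the
 * remainder; a result of 0 would mean the pointer is already aligned.
 * Hypothetical example: an address of 0x1003 with alignment 4 yields 3. */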
96 #if defined(YEP_NVIDIA_COMPILER)
97 return __double_as_longlong(x);
98 #elif defined(YEP_INTEL_COMPILER)
99 return _castf64_u64(x);
105 float64_word64.float64 = x;
106 return float64_word64.word64;
111 #if defined(YEP_NVIDIA_COMPILER)
112 return __longlong_as_double(x);
113 #elif defined(YEP_INTEL_COMPILER)
114 return _castu64_f64(x);
120 float64_word64.word64 = x;
121 return float64_word64.float64;
126 #if defined(YEP_NVIDIA_COMPILER)
127 return __float_as_int(x);
128 #elif defined(YEP_INTEL_COMPILER)
129 return _castf32_u32(x);
135 float32_word32.float32 = x;
136 return float32_word32.word32;
141 #if defined(YEP_NVIDIA_COMPILER)
142 return __int_as_float(x);
143 #elif defined(YEP_INTEL_COMPILER)
144 return _castu32_f32(x);
150 float32_word32.word32 = x;
151 return float32_word32.float32;
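/* The casts above reinterpret the bits of a value without numeric conversion:
 * CUDA and the Intel compiler expose dedicated intrinsics (__double_as_longlong,
 * _castf64_u64, ...), while the generic path presumably writes one member of a
 * union (defined on elided lines) and reads the other. A std::memcpy of
 * sizeof(value) bytes would be an equivalent, strict-aliasing-safe sketch. */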
156 const Yep64u signMask = 0x8000000000000000ull;
158 const Yep64u n = yepBuiltin_Cast_64f_64u(x);
160 return n ^ mask ^ signMask;
164 const Yep64u signMask = 0x8000000000000000ull;
166 const Yep64u m = n ^ signMask;
168 return yepBuiltin_Cast_64u_64f(m ^ mask);
172 const Yep32u signMask = 0x80000000u;
174 const Yep32u n = yepBuiltin_Cast_32f_32u(x);
176 return n ^ mask ^ signMask;
180 const Yep32u signMask = 0x80000000u;
182 const Yep32u m = n ^ signMask;
184 return yepBuiltin_Cast_32u_32f(m ^ mask);
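/* The helpers above negate a float purely in the integer domain: XORing the
 * sign bit (signMask) flips the sign of any value, including NaNs and
 * infinities, without raising floating-point exceptions. The extra mask operand
 * comes from an elided line and presumably makes the negation conditional
 * (XOR with 0 leaves the value unchanged). */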
188 #if defined(YEP_GNU_COMPILER) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)) || ((__GNUC__ == 4) && (__GNUC_MINOR__ == 7) && (__GNUC_PATCHLEVEL__ >= 3)))
189 return __builtin_bswap16(n);
190 #elif defined(YEP_CLANG_COMPILER) && ((__clang_major__ > 3) || ((__clang_major__ == 3) && (__clang_minor__ >= 2)))
191 return __builtin_bswap16(n);
192 #elif defined(YEP_INTEL_COMPILER)
194 #elif defined(YEP_GCC_COMPATIBLE_COMPILER)
195 return Yep16u(__builtin_bswap32(n << 16));
196 #elif defined(YEP_MSVC_COMPATIBLE_COMPILER)
197 return _byteswap_ushort(n);
204 #if defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_LINUX)
205 return __builtin_bswap32(n);
206 #elif defined(YEP_MICROSOFT_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_WINDOWS)
207 return _byteswap_ulong(n);
208 #elif defined(YEP_NVIDIA_COMPILER)
209 return __byte_perm(n, n, 0x3210);
211 return (n >> 24) | ((n >> 8) & 0x0000FF00u) | ((n << 8) & 0x00FF0000u) | (n << 24);
216 #if defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_LINUX)
217 return __builtin_bswap64(n);
218 #elif defined(YEP_MICROSOFT_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_WINDOWS)
219 return _byteswap_uint64(n);
221 const Yep32u nLo = yepBuiltin_GetLowPart_64u_32u(n);
222 const Yep32u nHi = yepBuiltin_GetHighPart_64u_32u(n);
223 const Yep32u nLoSwapped = yepBuiltin_ByteSwap_32u_32u(nLo);
224 const Yep32u nHiSwapped = yepBuiltin_ByteSwap_32u_32u(nHi);
225 return yepBuiltin_CombineParts_32u32u_64u(nLoSwapped, nHiSwapped);
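/* Fallback byte swap for 64-bit values when no compiler intrinsic is available:
 * swap each 32-bit half independently, then recombine the halves in reverse
 * order. Worked example (assuming the helper names behave as their names say):
 *   0x0102030405060708 -> halves 0x01020304 / 0x05060708
 *   -> swapped 0x04030201 / 0x08070605 -> recombined 0x0807060504030201. */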
230 #if defined(YEP_MICROSOFT_COMPILER)
232 #elif defined(YEP_INTEL_COMPILER)
234 #elif defined(YEP_ARM_COMPILER)
236 #elif defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER)
237 return __builtin_fabsf(x);
238 #elif defined(YEP_NVIDIA_COMPILER)
241 return x >= 0.0f ? x : -x;
246 #if defined(YEP_MICROSOFT_COMPILER)
248 #elif defined(YEP_INTEL_COMPILER)
250 #elif defined(YEP_ARM_COMPILER)
252 #elif defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER)
253 return __builtin_fabs(x);
254 #elif defined(YEP_NVIDIA_COMPILER)
257 return x >= 0.0 ? x : -x;
302 #if defined(YEP_ARM_CPU)
304 #elif defined(YEP_MIPS_CPU)
312 #if defined(YEP_ARM_CPU)
328 #if defined(YEP_ARM_CPU)
336 #if defined(YEP_ARM_CPU)
344 return (a > b) ? b : a;
348 return (a > b) ? b : a;
352 return (a > b) ? b : a;
356 return (a > b) ? b : a;
360 return (a > b) ? b : a;
364 return (a > b) ? b : a;
368 return (a > b) ? b : a;
372 return (a > b) ? b : a;
376 #if defined(YEP_NVIDIA_COMPILER)
378 #elif defined(YEP_COMPILER_SUPPORTS_X86_SSE_EXTENSION) && defined(YEP_PROCESSOR_SUPPORTS_X86_SSE_EXTENSION)
379 return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(a), _mm_set_ss(b)));
380 #elif defined(YEP_PROCESSOR_SUPPORTS_SINGLE_PRECISION_FPU_INSTRUCTIONS)
382 return (a < b) ? a : b;
387 Yep32u au = yepBuiltin_Cast_32f_32u(a);
388 Yep32u bu = yepBuiltin_Cast_32f_32u(b);
391 const Yep32u twoBu = bu + bu;
398 const Yep32u twoAu = au + au;
404 const Yep32s as = Yep32s(au) >= 0 ? au : 0x80000000 - au;
405 const Yep32s bs = Yep32s(bu) >= 0 ? bu : 0x80000000 - bu;
406 return as < bs ? yepBuiltin_Cast_32u_32f(au) : yepBuiltin_Cast_32u_32f(bu);
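/* Scalar fallback for min without a native FPU min: the bit patterns are
 * remapped from IEEE sign/magnitude into a monotonic integer ordering
 * (non-negative values map to themselves, negative values to 0x80000000 - bits),
 * so an ordinary signed compare picks the smaller float. The elided checks on
 * twoAu/twoBu presumably detect NaN operands (doubling shifts the sign bit out,
 * leaving only exponent and mantissa) before this compare is reached. */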
411 #if defined(YEP_NVIDIA_COMPILER)
413 #elif defined(YEP_COMPILER_SUPPORTS_X86_SSE2_EXTENSION) && defined(YEP_PROCESSOR_SUPPORTS_X86_SSE2_EXTENSION)
414 return _mm_cvtsd_f64(_mm_min_sd(_mm_set_sd(a), _mm_set_sd(b)));
415 #elif defined(YEP_PROCESSOR_SUPPORTS_DOUBLE_PRECISION_FPU_INSTRUCTIONS)
417 return (a < b) ? a : b;
422 Yep64u au = yepBuiltin_Cast_64f_64u(a);
423 Yep64u bu = yepBuiltin_Cast_64f_64u(b);
426 const Yep64u negBu = bu | 0x8000000000000000ull;
433 const Yep64u negAu = au | 0x8000000000000000ull;
439 const Yep64s as = Yep64s(au) >= 0ll ? au : 0x8000000000000000ll - au;
440 const Yep64s bs = Yep64s(bu) >= 0ll ? bu : 0x8000000000000000ll - bu;
441 return as < bs ? yepBuiltin_Cast_64u_64f(au) : yepBuiltin_Cast_64u_64f(bu);
446 return (a > b) ? a : b;
450 return (a > b) ? a : b;
454 return (a > b) ? a : b;
458 return (a > b) ? a : b;
462 return (a > b) ? a : b;
466 return (a > b) ? a : b;
470 return (a > b) ? a : b;
474 return (a > b) ? a : b;
478 #if defined(YEP_NVIDIA_COMPILER)
480 #elif defined(YEP_COMPILER_SUPPORTS_X86_SSE_EXTENSION) && defined(YEP_PROCESSOR_SUPPORTS_X86_SSE_EXTENSION)
481 return _mm_cvtss_f32(_mm_max_ss(_mm_set_ss(a), _mm_set_ss(b)));
482 #elif defined(YEP_PROCESSOR_SUPPORTS_SINGLE_PRECISION_FPU_INSTRUCTIONS)
484 return (a > b) ? a : b;
489 Yep32u au = yepBuiltin_Cast_32f_32u(a);
490 Yep32u bu = yepBuiltin_Cast_32f_32u(b);
493 const Yep32u twoBu = bu + bu;
500 const Yep32u twoAu = au + au;
506 const Yep32s as = Yep32s(au) >= 0 ? au : 0x80000000 - au;
507 const Yep32s bs = Yep32s(bu) >= 0 ? bu : 0x80000000 - bu;
508 return as > bs ? yepBuiltin_Cast_32u_32f(au) : yepBuiltin_Cast_32u_32f(bu);
513 #if defined(YEP_NVIDIA_COMPILER)
515 #elif defined(YEP_COMPILER_SUPPORTS_X86_SSE2_EXTENSION) && defined(YEP_PROCESSOR_SUPPORTS_X86_SSE2_EXTENSION)
516 return _mm_cvtsd_f64(_mm_max_sd(_mm_set_sd(a), _mm_set_sd(b)));
517 #elif defined(YEP_PROCESSOR_SUPPORTS_DOUBLE_PRECISION_FPU_INSTRUCTIONS)
519 return (a > b) ? a : b;
524 Yep64u au = yepBuiltin_Cast_64f_64u(a);
525 Yep64u bu = yepBuiltin_Cast_64f_64u(b);
528 const Yep64u negBu = bu | 0x8000000000000000ull;
535 const Yep64u negAu = au | 0x8000000000000000ull;
541 const Yep64s as = Yep64s(au) >= 0ll ? au : 0x8000000000000000ll - au;
542 const Yep64s bs = Yep64s(bu) >= 0ll ? bu : 0x8000000000000000ll - bu;
543 return as > bs ? yepBuiltin_Cast_64u_64f(au) : yepBuiltin_Cast_64u_64f(bu);
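/* The max variants mirror the min variants above: prefer the native SSE/SSE2 or
 * CUDA instruction when available, otherwise fall back to the same
 * sign/magnitude integer comparison. Note that _mm_min_ss/_mm_max_ss return the
 * second operand when either input is NaN, so the SSE path and the bit-trick
 * fallback may disagree on NaN inputs. */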
548 return (x < xMin) ? xMin : (x > xMax) ? xMax : x;
552 return (x < xMin) ? xMin : (x > xMax) ? xMax : x;
564 #if defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER) || defined(YEP_NVIDIA_COMPILER)
565 return __builtin_inff();
567 const static Yep32f one = 1.0f;
568 const static Yep32f zero = 0.0f;
574 #if defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER) || defined(YEP_NVIDIA_COMPILER)
575 return __builtin_inf();
577 const static Yep64f one = 1.0;
578 const static Yep64f zero = 0.0;
584 return -yepBuiltin_PositiveInfinity_32f();
588 return -yepBuiltin_PositiveInfinity_64f();
592 #if defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER) || defined(YEP_NVIDIA_COMPILER)
593 return __builtin_nanf("");
595 const static Yep32f zero = 0.0f;
601 #if defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER) || defined(YEP_NVIDIA_COMPILER)
602 return __builtin_nan("");
604 const static Yep64f zero = 0.0;
610 #if defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_UNIX)
611 return __builtin_clzll(x);
612 #elif (defined(YEP_MICROSOFT_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_WINDOWS)) && (defined(YEP_IA64_ABI) || defined(YEP_X64_ABI))
616 unsigned long bitPosition;
617 _BitScanReverse64(&bitPosition, x);
618 return 63u - bitPosition;
620 #elif (defined(YEP_MICROSOFT_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_WINDOWS)) && defined(YEP_X86_CPU)
621 const Yep32u xHi = yepBuiltin_GetHighPart_64u_32u(x);
622 const Yep32u xLo = yepBuiltin_GetLowPart_64u_32u(x);
623 unsigned long bitPositionHi, bitPositionLo;
624 _BitScanReverse(&bitPositionLo, xLo);
625 _BitScanReverse(&bitPositionHi, xHi);
630 return 63u - bitPositionLo;
633 return 31u - bitPositionHi;
635 #elif defined(YEP_NVIDIA_COMPILER)
638 #error "Compiler-specific implementation needed"
643 #if defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_UNIX)
644 return __builtin_clz(x);
645 #elif (defined(YEP_MICROSOFT_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_WINDOWS))
649 unsigned long bitPosition;
650 _BitScanReverse(&bitPosition, x);
651 return 31u - bitPosition;
653 #elif defined(YEP_NVIDIA_COMPILER)
656 #error "Compiler-specific implementation needed"
661 const Yep64f absX = yepBuiltin_Abs_64f_64f(x);
662 if YEP_LIKELY(absX < yepBuiltin_PositiveInfinity_64f()) {
663 return yepBuiltin_Cast_64u_64f(yepBuiltin_Cast_64f_64u(absX) + 1ull) - absX;
670 const Yep32f absX = yepBuiltin_Abs_32f_32f(x);
671 if YEP_LIKELY(absX < yepBuiltin_PositiveInfinity_32f()) {
672 return yepBuiltin_Cast_32u_32f(yepBuiltin_Cast_32f_32u(absX) + 1u) - absX;
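/* Ulp helpers: for finite x, incrementing the integer representation of |x| by
 * one yields the next representable magnitude, so the difference is exactly one
 * unit in the last place at that scale. The branch for infinite/NaN inputs sits
 * on elided lines below. */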
678 #if defined(YEP_GCC_COMPATIBLE_COMPILER) && defined(YEP_PROCESSOR_SUPPORTS_DOUBLE_PRECISION_FMA_INSTRUCTIONS)
680 return __builtin_fma(a, b, c);
684 return __builtin_fma(a, b, -c);
688 return __builtin_fma(-a, b, c);
692 return __builtin_fma(-a, b, -c);
694 #elif defined(YEP_NVIDIA_COMPILER) && defined(YEP_PROCESSOR_SUPPORTS_DOUBLE_PRECISION_FMA_INSTRUCTIONS)
696 return __fma_rn(a, b, c);
700 return __fma_rn(a, b, -c);
704 return __fma_rn(-a, b, c);
708 return __fma_rn(-a, b, -c);
712 #if defined(YEP_GCC_COMPATIBLE_COMPILER) && defined(YEP_PROCESSOR_SUPPORTS_SINGLE_PRECISION_FMA_INSTRUCTIONS)
714 return __builtin_fmaf(a, b, c);
718 return __builtin_fmaf(a, b, -c);
722 return __builtin_fmaf(-a, b, c);
726 return __builtin_fmaf(-a, b, -c);
728 #elif defined(YEP_NVIDIA_COMPILER) && defined(YEP_PROCESSOR_SUPPORTS_SINGLE_PRECISION_FMA_INSTRUCTIONS)
730 return __fmaf_rn(a, b, c);
734 return __fmaf_rn(a, b, -c);
738 return __fmaf_rn(-a, b, c);
742 return __fmaf_rn(-a, b, -c);
746 #if defined(YEP_PROCESSOR_SUPPORTS_DOUBLE_PRECISION_FMA_INSTRUCTIONS)
748 const Yep64f q = y * rcpC;
749 const Yep64f r = yepBuiltin_FNMA_64f64f64f_64f(c, q, y);
750 return yepBuiltin_FMA_64f64f64f_64f(r, rcpC, q);
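/* The FMA path above appears to refine a reciprocal-based division: q = y * rcpC
 * is the first estimate, r = y - c * q (one fused negated multiply-add) is the
 * residual, and q + r * rcpC applies a single Newton-style correction. The
 * operands y, c and rcpC are defined on elided lines. */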
754 return yepBuiltin_FMA_64f64f64f_64f(a, b, c);
758 return yepBuiltin_FMS_64f64f64f_64f(a, b, c);
774 #if defined(YEP_PROCESSOR_SUPPORTS_SINGLE_PRECISION_FMA_INSTRUCTIONS)
776 const Yep32f q = y * rcpC;
777 const Yep32f r = yepBuiltin_FNMA_32f32f32f_32f(c, q, y);
778 return yepBuiltin_FMA_32f32f32f_32f(r, rcpC, q);
782 return yepBuiltin_FMA_32f32f32f_32f(a, b, c);
786 return yepBuiltin_FMS_32f32f32f_32f(a, b, c);
805 #if defined(YEP_NVIDIA_COMPILER)
807 sum.high = __dadd_rn(a, b);
808 const Yep64f bCorrected = __dadd_rn(sum.high, -a);
809 const Yep64f deltaB = __dadd_rn(b, -bCorrected);
815 const Yep64f bCorrected = sum.high - a;
816 const Yep64f deltaB = b - bCorrected;
824 #if defined(YEP_NVIDIA_COMPILER)
826 sum.high = __dadd_rn(a, b);
827 const Yep64f aCorrected = __dadd_rn(sum.high, -b);
828 const Yep64f bCorrected = __dadd_rn(sum.high, -aCorrected);
829 const Yep64f deltaA = __dadd_rn(a, -aCorrected);
830 const Yep64f deltaB = __dadd_rn(b, -bCorrected);
831 sum.low = __dadd_rn(deltaA, deltaB);
836 const Yep64f aCorrected = sum.high - b;
837 const Yep64f bCorrected = sum.high - aCorrected;
838 const Yep64f deltaA = a - aCorrected;
839 const Yep64f deltaB = b - bCorrected;
840 sum.low = deltaA + deltaB;
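/* Error-free addition: sum.high is the rounded sum and sum.low recovers the
 * rounding error, so high + low equals a + b exactly. The first variant
 * (Dekker's Fast2Sum) assumes |a| >= |b| and reconstructs only b's lost part;
 * the variant above is the branch-free 2Sum of Knuth/Moller, valid for any
 * ordering. The CUDA path spells every addition as __dadd_rn, presumably to pin
 * round-to-nearest and keep the compiler from contracting the expressions. */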
848 product.high = a * b;
849 #if defined(YEP_PROCESSOR_SUPPORTS_FMA_EXTENSION)
850 product.low = yepBuiltin_FMS_64f64f64f_64f(a, b, product.high);
854 const Yep64u mask = 0xFFFFFFFFF8000000ull;
855 da.high = yepBuiltin_Cast_64u_64f(yepBuiltin_Cast_64f_64u(a) & mask);
856 da.low = a - da.high;
857 db.high = yepBuiltin_Cast_64u_64f(yepBuiltin_Cast_64f_64u(b) & mask);
858 db.low = b - db.high;
859 const Yep64f t1 = -product.high + Yep64f(da.high * db.high);
862 product.low = t3 + Yep64f(da.low * db.low);
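/* Error-free multiplication: product.high is the rounded product; with FMA the
 * low word is simply fma(a, b, -high). Without FMA the code uses a Veltkamp
 * split: clearing the low 27 significand bits (mask 0xFFFFFFFFF8000000) leaves
 * "high" factors short enough that the partial products da.high*db.high etc.
 * are exact, and the t1..t3 accumulation (partly on elided lines) sums them
 * from largest to smallest to recover the rounding error. */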
868 const Yep32u defaultExponent = 0x3F800000u;
869 #if defined(YEP_COMPILER_SUPPORTS_HEXADECIMAL_FLOATING_POINT_CONSTANTS)
870 const Yep32f sqrt2 = 0x1.6A09E6p+0f;
871 const Yep32f c2 = -0x1.FFFFF2p-2f;
872 const Yep32f c3 = 0x1.55571Ep-2f;
873 const Yep32f c4 = -0x1.0006B2p-2f;
874 const Yep32f c5 = 0x1.98CB06p-3f;
875 const Yep32f c6 = -0x1.530B6Ap-3f;
876 const Yep32f c7 = 0x1.317FD6p-3f;
877 const Yep32f c8 = -0x1.26F724p-3f;
878 const Yep32f c9 = 0x1.6A66D0p-4f;
880 const Yep32f ln2_hi = 0x1.62E400p-1f;
881 const Yep32f ln2_lo = 0x1.7F7D1Cp-20f;
883 const Yep32f sqrt2 = 1.41421353816986083984375f;
884 const Yep32f c2 = -0.4999997913837432861328125f;
885 const Yep32f c3 = 0.3333401381969451904296875f;
886 const Yep32f c4 = -0.2500255405902862548828125f;
887 const Yep32f c5 = 0.19960598647594451904296875f;
888 const Yep32f c6 = -0.16554911434650421142578125f;
889 const Yep32f c7 = 0.14916960895061492919921875f;
890 const Yep32f c8 = -0.1440260708332061767578125f;
891 const Yep32f c9 = 8.8476955890655517578125e-2f;
893 const Yep32f ln2_hi = 0.693145751953125f;
894 const Yep32f ln2_lo = 1.428606765330187045037746429443359375e-6f;
899 const Yep32s xWord = yepBuiltin_Cast_32f_32u(x);
902 return yepBuiltin_NaN_32f();
905 return yepBuiltin_NegativeInfinity_32f();
915 mantissa = xWord & 0x007FFFFFu;
918 const Yep32u pointOffset = yepBuiltin_Nlz_32u_32u(xWord) - 7u;
919 exponent = -126 - Yep32s(pointOffset);
920 mantissa = (xWord << pointOffset) & 0x007FFFFFu;
922 x = yepBuiltin_Cast_32u_32f(defaultExponent | mantissa);
927 const Yep32f t = x - 1.0f;
929 return (t + t * (t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * (c6 + t * (c7 + t * (c8 + t * c9)))))))) +
930 dexp * ln2_lo) + dexp * ln2_hi;
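/* Single-precision log: x is decomposed into mantissa and exponent (with a
 * denormal branch that renormalizes via the leading-zero count), the mantissa
 * is rebuilt with the default exponent so it sits near 1, and log is evaluated
 * as t + t^2*P(t) with t = x - 1 plus exponent * ln2. ln2 is split into
 * ln2_hi/ln2_lo so the exponent contribution is added in two pieces, limiting
 * cancellation error; the sqrt2 constant suggests the (elided) adjustment
 * recenters the mantissa into [sqrt(2)/2, sqrt(2)). */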
935 const Yep64u defaultExponent = 0x3FF0000000000000ull;
936 #if defined(YEP_COMPILER_SUPPORTS_HEXADECIMAL_FLOATING_POINT_CONSTANTS)
937 const Yep64f sqrt2 = 0x1.6A09E667F3BCDp+0;
938 const Yep64f c2 = -0x1.FFFFFFFFFFFF2p-2;
939 const Yep64f c3 = 0x1.5555555555103p-2;
940 const Yep64f c4 = -0x1.00000000013C7p-2;
941 const Yep64f c5 = 0x1.9999999A43E4Fp-3;
942 const Yep64f c6 = -0x1.55555554A6A2Bp-3;
943 const Yep64f c7 = 0x1.249248DAE4B2Ap-3;
944 const Yep64f c8 = -0x1.FFFFFFBD8606Dp-4;
945 const Yep64f c9 = 0x1.C71C90DB06248p-4;
946 const Yep64f c10 = -0x1.9999C5BE751E3p-4;
947 const Yep64f c11 = 0x1.745980F3FB889p-4;
948 const Yep64f c12 = -0x1.554D5ACD502ABp-4;
949 const Yep64f c13 = 0x1.3B4ED39194B87p-4;
950 const Yep64f c14 = -0x1.25480A82633AFp-4;
951 const Yep64f c15 = 0x1.0F23916A44515p-4;
952 const Yep64f c16 = -0x1.EED2E2BB64B2Ep-5;
953 const Yep64f c17 = 0x1.EA17E14773369p-5;
954 const Yep64f c18 = -0x1.1654764F478ECp-4;
955 const Yep64f c19 = 0x1.0266CD08DB2F2p-4;
956 const Yep64f c20 = -0x1.CC4EC078138E3p-6;
958 #if defined(YEP_PROCESSOR_SUPPORTS_FMA_EXTENSION)
959 const Yep64df ln2 = { 0x1.62E42FEFA39EFp-1, 0x1.ABC9E3B39803Fp-56 };
961 const Yep64df ln2 = { 0x1.62E42FEFA3800p-1, 0x1.EF35793C76730p-45 };
964 const Yep64f sqrt2 = 1.4142135623730951;
965 const Yep64f c2 = -0.4999999999999992;
966 const Yep64f c3 = 0.3333333333332719;
967 const Yep64f c4 = -0.25000000000028105;
968 const Yep64f c5 = 0.20000000001936022;
969 const Yep64f c6 = -0.16666666664680582;
970 const Yep64f c7 = 0.14285714071282857;
971 const Yep64f c8 = -0.12499999903264021;
972 const Yep64f c9 = 0.11111122688488095;
973 const Yep64f c10 = -0.10000016444912023;
974 const Yep64f c11 = 0.09090566990173178;
975 const Yep64f c12 = -0.08332572431118972;
976 const Yep64f c13 = 0.07697947162641415;
977 const Yep64f c14 = -0.0716019068260738;
978 const Yep64f c15 = 0.06619602968955392;
979 const Yep64f c16 = -0.060403292499492486;
980 const Yep64f c17 = 0.059825839994664794;
981 const Yep64f c18 = -0.06795164313050223;
982 const Yep64f c19 = 0.06308631984365912;
983 const Yep64f c20 = -0.028094947774939604;
985 #if defined(YEP_PROCESSOR_SUPPORTS_FMA_EXTENSION)
986 const Yep64df ln2 = { 0.6931471805599453, 2.3190468138462996e-17 };
988 const Yep64df ln2 = { 0.6931471805598903, 5.497923018708371e-14 };
993 const Yep64s xWord = yepBuiltin_Cast_64f_64u(x);
994 if YEP_LIKELY(xWord >= 0x0010000000000000ull) {
996 exponent = Yep32s((yepBuiltin_GetHighPart_64u_32u(Yep64u(xWord)) >> 20) & 0x7FFu) - 1023;
997 mantissa = xWord & 0x000FFFFFFFFFFFFFull;
1000 const Yep32u pointOffset = yepBuiltin_Nlz_64u_32u(xWord) - 11u;
1001 exponent = -1022 - Yep32s(pointOffset);
1002 mantissa = (xWord << pointOffset) & 0x000FFFFFFFFFFFFFull;
1004 x = yepBuiltin_Cast_64u_64f(defaultExponent | mantissa);
1009 const Yep64f t = x - 1.0;
1010 const Yep64f dexp = yepBuiltin_Convert_32s_64f(exponent);
1011 const Yep64f pt = yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
1012 yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
1013 yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
1014 yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
1015 yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
1016 yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
1017 yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
1018 yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
1019 yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
1020 yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
1021 yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
1022 yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
1023 yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
1024 yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
1025 yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
1026 yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
1027 yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
1028 yepBuiltin_MultiplyAdd_64f64f64f_64f(t, c20, c19),
1046 const Yep64f rf = yepBuiltin_MultiplyAdd_64f64f64f_64f(t, t * pt, t);
1047 Yep64f f = yepBuiltin_MultiplyAdd_64f64f64f_64f(dexp, ln2.high, yepBuiltin_MultiplyAdd_64f64f64f_64f(dexp, ln2.low, rf));
1050 f = yepBuiltin_NaN_64f();
1053 f = yepBuiltin_NegativeInfinity_64f();
1054 } else if YEP_UNLIKELY(!(x < yepBuiltin_PositiveInfinity_64f())) {
1062 #if defined(YEP_COMPILER_SUPPORTS_HEXADECIMAL_FLOATING_POINT_CONSTANTS)
1063 const Yep32f magicBias = 0x1.000000p+23f;
1064 const Yep32f zeroCutoff = -0x1.9FE368p+6f;
1065 const Yep32f infCutoff = 0x1.62E42Ep+6f;
1066 const Yep32f log2e = 0x1.715476p+0f;
1067 const Yep32f ln2_hi = 0x1.62E400p-1f;
1068 const Yep32f ln2_lo = 0x1.7F7D1Cp-20f;
1070 const Yep32f c2 = 0x1.FFFFFCp-2f;
1071 const Yep32f c3 = 0x1.55548Cp-3f;
1072 const Yep32f c4 = 0x1.555834p-5f;
1073 const Yep32f c5 = 0x1.123CFEp-7f;
1074 const Yep32f c6 = 0x1.6ADCAEp-10f;
1076 const Yep32f magicBias = 8388608.0f;
1077 const Yep32f zeroCutoff = -1.03972076416015625e+2f;
1078 const Yep32f infCutoff = 8.872283172607421875e+1f;
1079 const Yep32f log2e = 1.44269502162933349609375f;
1080 const Yep32f ln2_hi = 0.693145751953125f;
1081 const Yep32f ln2_lo = 1.428606765330187045037746429443359375e-6f;
1083 const Yep32f c2 = 0.499999940395355224609375f;
1084 const Yep32f c3 = 0.1666651666164398193359375f;
1085 const Yep32f c4 = 4.1668035089969635009765625e-2f;
1086 const Yep32f c5 = 8.369087241590023040771484375e-3f;
1087 const Yep32f c6 = 1.384208793751895427703857421875e-3f;
1093 Yep32f t = x * log2e + magicBias;
1094 Yep32u e1 = yepBuiltin_Cast_32f_32u(t) << 23;
1096 e1 = yepBuiltin_Clamp_32s32s32s_32s(e1, -126 << 23, 127 << 23);
1098 const Yep32f s1 = yepBuiltin_Cast_32u_32f(e1 + 0x3F800000u);
1099 const Yep32f s2 = yepBuiltin_Cast_32u_32f(e2 + 0x3F800000u);
1101 const Yep32f rx = (x - t * ln2_hi) - t * ln2_lo;
1102 const Yep32f rf = rx + rx * rx * (c2 + rx * (c3 + rx * (c4 + rx * (c5 + rx * c6))));
1103 Yep32f f = s2 * (s1 * rf + s1);
1105 f = yepBuiltin_PositiveInfinity_32f();
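/* Single-precision exp: adding magicBias rounds x*log2(e) to the nearest
 * integer n in the low bits of the float (magic-bias trick), the reduced
 * argument rx = x - n*ln2 is formed with the ln2_hi/ln2_lo pair, and e^rx is
 * approximated by a degree-6 polynomial. The scale 2^n is rebuilt from two
 * factors s1 and s2 (e1 clamped to the representable exponent range, e2 the
 * remainder, assembled on partly elided lines) so extreme n cannot overflow a
 * single scale factor; inputs beyond the cutoffs are forced to +inf or 0. */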
1116 #if defined(YEP_COMPILER_SUPPORTS_HEXADECIMAL_FLOATING_POINT_CONSTANTS)
1117 const Yep64f magicBias = 0x1.8000000000000p+52;
1118 const Yep64f log2e = 0x1.71547652B82FEp+0;
1119 #if defined(YEP_PROCESSOR_SUPPORTS_FMA_EXTENSION)
1120 const Yep64df ln2 = { 0x1.62E42FEFA39EFp-1, 0x1.ABC9E3B39803Fp-56 };
1122 const Yep64df ln2 = { 0x1.62E42FEFA3800p-1, 0x1.EF35793C76730p-45 };
1124 const Yep64f c2 = 0x1.0000000000005p-1;
1125 const Yep64f c3 = 0x1.5555555555540p-3;
1126 const Yep64f c4 = 0x1.5555555552115p-5;
1127 const Yep64f c5 = 0x1.11111111173CAp-7;
1128 const Yep64f c6 = 0x1.6C16C17F2BF99p-10;
1129 const Yep64f c7 = 0x1.A01A017EEB164p-13;
1130 const Yep64f c8 = 0x1.A019A6AC02A7Dp-16;
1131 const Yep64f c9 = 0x1.71DE71651CE7Ap-19;
1132 const Yep64f c10 = 0x1.28A284098D813p-22;
1133 const Yep64f c11 = 0x1.AE9043CA87A40p-26;
1135 const Yep64f zeroCutoff = -0x1.74910D52D3051p+9;
1136 const Yep64f infCutoff = 0x1.62E42FEFA39EFp+9;
1138 const Yep64f magicBias = 6755399441055744.0;
1139 const Yep64f log2e = 1.4426950408889634;
1140 #if defined(YEP_PROCESSOR_SUPPORTS_FMA_EXTENSION)
1141 const Yep64df ln2 = { 0.6931471805599453, 2.3190468138462996e-17 };
1143 const Yep64df ln2 = { 0.6931471805598903, 5.497923018708371e-14 };
1145 const Yep64f c2 = 0.5000000000000006;
1146 const Yep64f c3 = 0.16666666666666607;
1147 const Yep64f c4 = 0.04166666666657385;
1148 const Yep64f c5 = 0.008333333333377175;
1149 const Yep64f c6 = 0.0013888888932278352;
1150 const Yep64f c7 = 0.0001984126974695729;
1151 const Yep64f c8 = 2.4801504579877947e-5;
1152 const Yep64f c9 = 2.755738182142102e-6;
1153 const Yep64f c10 = 2.762627110160372e-7;
1154 const Yep64f c11 = 2.5062096212675488e-8;
1156 const Yep64f zeroCutoff = -745.1332191019411;
1157 const Yep64f infCutoff = 709.7827128933840;
1163 Yep64f t = x * log2e + magicBias;
1164 Yep32u e1 = yepBuiltin_GetLowPart_64u_32u(yepBuiltin_Cast_64f_64u(t)) << 20;
1166 e1 = yepBuiltin_Clamp_32s32s32s_32s(e1, -1022 << 20, 1023 << 20);
1168 const Yep64f s1 = yepBuiltin_Cast_64u_64f(yepBuiltin_CombineParts_32u32u_64u(e1 + 0x3FF00000u, 0u));
1169 const Yep64f s2 = yepBuiltin_Cast_64u_64f(yepBuiltin_CombineParts_32u32u_64u(e2 + 0x3FF00000u, 0u));
1171 const Yep64f rx = (x - t * ln2.high) - t * ln2.low;
1172 const Yep64f px = yepBuiltin_MultiplyAdd_64f64f64f_64f(rx,
1173 yepBuiltin_MultiplyAdd_64f64f64f_64f(rx,
1174 yepBuiltin_MultiplyAdd_64f64f64f_64f(rx,
1175 yepBuiltin_MultiplyAdd_64f64f64f_64f(rx,
1176 yepBuiltin_MultiplyAdd_64f64f64f_64f(rx,
1177 yepBuiltin_MultiplyAdd_64f64f64f_64f(rx,
1178 yepBuiltin_MultiplyAdd_64f64f64f_64f(rx,
1179 yepBuiltin_MultiplyAdd_64f64f64f_64f(rx,
1180 yepBuiltin_MultiplyAdd_64f64f64f_64f(rx, c11, c10),
1189 const Yep64f rf = yepBuiltin_MultiplyAdd_64f64f64f_64f(rx, rx * px, rx);
1190 Yep64f f = s2 * yepBuiltin_MultiplyAdd_64f64f64f_64f(s1, rf, s1);
1192 f = yepBuiltin_PositiveInfinity_64f();
1202 #if defined(YEP_COMPILER_SUPPORTS_HEXADECIMAL_FLOATING_POINT_CONSTANTS)
1203 const Yep64f minusPio2_hi = -0x1.921FB54440000p+0;
1204 const Yep64f minusPio2_me = -0x1.68C234C4C8000p-39;
1205 const Yep64f minusPio2_lo = 0x1.9D747F23E32EDp-79;
1206 const Yep64f twoOPi = 0x1.45F306DC9C883p-1;
1207 const Yep64f magicBias = 0x1.8000000000000p+52;
1209 const Yep64f c0 = 0x1.0000000000000p+0;
1210 const Yep64f c2 = -0x1.0000000000000p-1;
1211 const Yep64f c3 = -0x1.5555555555546p-3;
1212 const Yep64f c4 = 0x1.555555555554Bp-5;
1213 const Yep64f c5 = 0x1.111111110F51Ep-7;
1214 const Yep64f c6 = -0x1.6C16C16C15038p-10;
1215 const Yep64f c7 = -0x1.A01A019BB92C0p-13;
1216 const Yep64f c8 = 0x1.A01A019C94874p-16;
1217 const Yep64f c9 = 0x1.71DE3535C8A8Ap-19;
1218 const Yep64f c10 = -0x1.27E4F7F65104Fp-22;
1219 const Yep64f c11 = -0x1.AE5E38936D046p-26;
1220 const Yep64f c12 = 0x1.1EE9DF6693F7Ep-29;
1221 const Yep64f c13 = 0x1.5D8711D281543p-33;
1222 const Yep64f c14 = -0x1.8FA87EF79AE3Fp-37;
1224 const Yep64f minusPio2_hi = -1.5707963267923333;
1225 const Yep64f minusPio2_me = -2.5633441515971907e-12;
1226 const Yep64f minusPio2_lo = 2.6718907338610155e-24;
1227 const Yep64f twoOPi = 0.6366197723675814;
1228 const Yep64f magicBias = 6755399441055744.0;
1232 const Yep64f c3 = -0.16666666666666624;
1233 const Yep64f c4 = 0.041666666666666595;
1234 const Yep64f c5 = 0.008333333333320921;
1235 const Yep64f c6 = -0.0013888888888873418;
1236 const Yep64f c7 = -0.0001984126982882608;
1237 const Yep64f c8 = 2.480158728907678e-05;
1238 const Yep64f c9 = 2.755731339913502e-06;
1239 const Yep64f c10 = -2.755731424340092e-07;
1240 const Yep64f c11 = -2.505071756776031e-08;
1241 const Yep64f c12 = 2.0875709384133097e-09;
1242 const Yep64f c13 = 1.58946757299646e-10;
1243 const Yep64f c14 = -1.135896887279365e-11;
1245 Yep64f t = x * twoOPi + magicBias;
1246 const Yep32u n = yepBuiltin_GetLowPart_64u_32u(yepBuiltin_Cast_64f_64u(t));
1248 x += t * minusPio2_hi;
1250 const Yep64f midProduct = t * minusPio2_me;
1252 const Yep64f r = midProduct - (x - a);
1253 x += (t * minusPio2_lo + r);
1255 const Yep64f sqrX = x * x;
1257 sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c11);
1258 sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c9);
1259 sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c7);
1260 sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c5);
1261 sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c3);
1263 sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, x, x);
1265 cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c12);
1266 cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c10);
1267 cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c8);
1268 cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c6);
1269 cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c4);
1270 cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c2);
1271 cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c0);
1273 const Yep64f f = (n & 1) ? cosX : sinX;
1274 return (n & 2) ? -f : f;
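/* sin: the argument is reduced by multiples of pi/2 using the magic-bias trick
 * (n = nearest integer to x*2/pi) and a three-part -pi/2 constant (hi/me/lo, a
 * Cody-Waite style reduction with an explicit compensation term r). Sine and
 * cosine polynomials are both evaluated on the reduced argument; bit 0 of n
 * picks which one applies in this quadrant and bit 1 picks the sign. This
 * reduction is accurate for moderate |x|; very large arguments would need a
 * wider 2/pi reduction. */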
1278 #if defined(YEP_COMPILER_SUPPORTS_HEXADECIMAL_FLOATING_POINT_CONSTANTS)
1279 const Yep64f minusPio2_hi = -0x1.921FB54440000p+0;
1280 const Yep64f minusPio2_me = -0x1.68C234C4C8000p-39;
1281 const Yep64f minusPio2_lo = 0x1.9D747F23E32EDp-79;
1282 const Yep64f twoOPi = 0x1.45F306DC9C883p-1;
1283 const Yep64f magicBias = 0x1.8000000000000p+52;
1285 const Yep64f c0 = -0x1.0000000000000p+0;
1286 const Yep64f c2 = 0x1.0000000000000p-1;
1287 const Yep64f c3 = -0x1.5555555555546p-3;
1288 const Yep64f c4 = -0x1.555555555554Bp-5;
1289 const Yep64f c5 = 0x1.111111110F51Ep-7;
1290 const Yep64f c6 = 0x1.6C16C16C15038p-10;
1291 const Yep64f c7 = -0x1.A01A019BB92C0p-13;
1292 const Yep64f c8 = -0x1.A01A019C94874p-16;
1293 const Yep64f c9 = 0x1.71DE3535C8A8Ap-19;
1294 const Yep64f c10 = 0x1.27E4F7F65104Fp-22;
1295 const Yep64f c11 = -0x1.AE5E38936D046p-26;
1296 const Yep64f c12 = -0x1.1EE9DF6693F7Ep-29;
1297 const Yep64f c13 = 0x1.5D8711D281543p-33;
1298 const Yep64f c14 = 0x1.8FA87EF79AE3Fp-37;
1300 const Yep64f minusPio2_hi = -1.5707963267923333;
1301 const Yep64f minusPio2_me = -2.5633441515971907e-12;
1302 const Yep64f minusPio2_lo = 2.6718907338610155e-24;
1303 const Yep64f twoOPi = 0.6366197723675814;
1304 const Yep64f magicBias = 6755399441055744.0;
1308 const Yep64f c3 = -0.16666666666666624;
1309 const Yep64f c4 = -0.041666666666666595;
1310 const Yep64f c5 = 0.008333333333320921;
1311 const Yep64f c6 = 0.0013888888888873418;
1312 const Yep64f c7 = -0.0001984126982882608;
1313 const Yep64f c8 = -2.480158728907678e-05;
1314 const Yep64f c9 = 2.755731339913502e-06;
1315 const Yep64f c10 = 2.755731424340092e-07;
1316 const Yep64f c11 = -2.505071756776031e-08;
1317 const Yep64f c12 = -2.0875709384133097e-09;
1318 const Yep64f c13 = 1.58946757299646e-10;
1319 const Yep64f c14 = 1.135896887279365e-11;
1321 Yep64f t = x * twoOPi + magicBias;
1322 const Yep32u n = yepBuiltin_GetLowPart_64u_32u(yepBuiltin_Cast_64f_64u(t));
1324 x += t * minusPio2_hi;
1326 const Yep64f midProduct = t * minusPio2_me;
1328 const Yep64f r = midProduct - (x - a);
1329 x += (t * minusPio2_lo + r);
1331 const Yep64f sqrX = x * x;
1333 sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c11);
1334 sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c9);
1335 sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c7);
1336 sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c5);
1337 sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c3);
1339 sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, x, x);
1341 minusCosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(minusCosX, sqrX, c12);
1342 minusCosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(minusCosX, sqrX, c10);
1343 minusCosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(minusCosX, sqrX, c8);
1344 minusCosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(minusCosX, sqrX, c6);
1345 minusCosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(minusCosX, sqrX, c4);
1346 minusCosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(minusCosX, sqrX, c2);
1347 minusCosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(minusCosX, sqrX, c0);
1349 const Yep64f f = (n & 1) ? sinX : minusCosX;
1350 return (n & 2) ? f : -f;
1354 #if defined(YEP_COMPILER_SUPPORTS_HEXADECIMAL_FLOATING_POINT_CONSTANTS)
1355 const Yep64f minusPio2_hi = -0x1.921FB54440000p+0;
1356 const Yep64f minusPio2_me = -0x1.68C234C4C8000p-39;
1357 const Yep64f minusPio2_lo = 0x1.9D747F23E32EDp-79;
1358 const Yep64f twoOPi = 0x1.45F306DC9C883p-1;
1359 const Yep64f magicBias = 0x1.8000000000000p+52;
1361 const Yep64f c0 = 0x1.0000000000000p+0;
1362 const Yep64f c2 = -0x1.0000000000000p-1;
1363 const Yep64f c3 = -0x1.5555555555546p-3;
1364 const Yep64f c4 = 0x1.555555555554Bp-5;
1365 const Yep64f c5 = 0x1.111111110F51Ep-7;
1366 const Yep64f c6 = -0x1.6C16C16C15038p-10;
1367 const Yep64f c7 = -0x1.A01A019BB92C0p-13;
1368 const Yep64f c8 = 0x1.A01A019C94874p-16;
1369 const Yep64f c9 = 0x1.71DE3535C8A8Ap-19;
1370 const Yep64f c10 = -0x1.27E4F7F65104Fp-22;
1371 const Yep64f c11 = -0x1.AE5E38936D046p-26;
1372 const Yep64f c12 = 0x1.1EE9DF6693F7Ep-29;
1373 const Yep64f c13 = 0x1.5D8711D281543p-33;
1374 const Yep64f c14 = -0x1.8FA87EF79AE3Fp-37;
1376 const Yep64f minusPio2_hi = -1.5707963267923333;
1377 const Yep64f minusPio2_me = -2.5633441515971907e-12;
1378 const Yep64f minusPio2_lo = 2.6718907338610155e-24;
1379 const Yep64f twoOPi = 0.6366197723675814;
1380 const Yep64f magicBias = 6755399441055744.0;
1384 const Yep64f c3 = -0.16666666666666624;
1385 const Yep64f c4 = 0.041666666666666595;
1386 const Yep64f c5 = 0.008333333333320921;
1387 const Yep64f c6 = -0.0013888888888873418;
1388 const Yep64f c7 = -0.0001984126982882608;
1389 const Yep64f c8 = 2.480158728907678e-05;
1390 const Yep64f c9 = 2.755731339913502e-06;
1391 const Yep64f c10 = -2.755731424340092e-07;
1392 const Yep64f c11 = -2.505071756776031e-08;
1393 const Yep64f c12 = 2.0875709384133097e-09;
1394 const Yep64f c13 = 1.58946757299646e-10;
1395 const Yep64f c14 = -1.135896887279365e-11;
1398 Yep64f t = x * twoOPi + magicBias;
1399 const Yep32u n = yepBuiltin_GetLowPart_64u_32u(yepBuiltin_Cast_64f_64u(t));
1401 x += t * minusPio2_hi;
1403 const Yep64f midProduct = t * minusPio2_me;
1405 const Yep64f r = midProduct - (x - a);
1406 x += (t * minusPio2_lo + r);
1408 const Yep64f sqrX = x * x;
1410 sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c11);
1411 sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c9);
1412 sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c7);
1413 sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c5);
1414 sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c3);
1416 sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, x, x);
1418 cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c12);
1419 cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c10);
1420 cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c8);
1421 cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c6);
1422 cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c4);
1423 cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c2);
1424 cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c0);
1426 return (n & 1) ? (-cosX / sinX) : (sinX / cosX);
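/* tan reuses the same reduction and the same sin/cos polynomials; for odd
 * quadrants it returns -cos/sin, which equals tan of the shifted argument, so a
 * single division finishes the job. */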
1429 #if defined(YEP_MSVC_COMPATIBLE_COMPILER)
1430 #pragma intrinsic(sqrt)
1434 #if defined(YEP_MSVC_COMPATIBLE_COMPILER)
1436 #elif defined(YEP_NVIDIA_COMPILER)
1437 return __fsqrt_rn(x);
1438 #elif defined(YEP_GCC_COMPATIBLE_COMPILER)
1439 return __builtin_sqrtf(x);
1440 #elif defined(YEP_ARM_COMPILER)
1443 #error "Compiler-specific implementation needed"
1448 #if defined(YEP_MSVC_COMPATIBLE_COMPILER)
1450 #elif defined(YEP_NVIDIA_COMPILER)
1451 return __dsqrt_rn(x);
1452 #elif defined(YEP_GCC_COMPATIBLE_COMPILER)
1453 return __builtin_sqrt(x);
1454 #elif defined(YEP_ARM_COMPILER)
1457 #error "Compiler-specific implementation needed"
1462 #if defined(YEP_COMPILER_SUPPORTS_HEXADECIMAL_FLOATING_POINT_CONSTANTS)
1463 const Yep64f half = 0x1.0000000000000p-1;
1464 const Yep64f ac3 = 0x1.5555555555332p-3;
1465 const Yep64f ac5 = 0x1.33333333768C7p-4;
1466 const Yep64f ac7 = 0x1.6DB6DB3E4DA8Ap-5;
1467 const Yep64f ac9 = 0x1.F1C72D5B739EFp-6;
1468 const Yep64f ac11 = 0x1.6E89DC94F7B19p-6;
1469 const Yep64f ac13 = 0x1.1C6D1EE2BF355p-6;
1470 const Yep64f ac15 = 0x1.C6E7A6CA04E0Dp-7;
1471 const Yep64f ac17 = 0x1.8F47A67BD13CFp-7;
1472 const Yep64f ac19 = 0x1.A7AC3B4A38FB8p-8;
1473 const Yep64f ac21 = 0x1.4296C857308B2p-6;
1474 const Yep64f ac23 = -0x1.0DB1C05152E38p-6;
1475 const Yep64f ac25 = 0x1.06AD1B749C8D4p-5;
1477 const Yep64f bc0 = 0x1.921FB54442D18p+0;
1478 const Yep64f bc1 = 0x1.6A09E667F3BC7p+1;
1479 const Yep64f bc3 = -0x1.E2B7DDDFF06ACp-2;
1480 const Yep64f bc5 = 0x1.B27247B01E1B8p-3;
1481 const Yep64f bc7 = -0x1.02995B468EBC5p-3;
1482 const Yep64f bc9 = 0x1.5FFB7742ECDC6p-4;
1483 const Yep64f bc11 = -0x1.032E1D4CDEC75p-4;
1484 const Yep64f bc13 = 0x1.924AF9192AF6Ap-5;
1485 const Yep64f bc15 = -0x1.41264A779EBFFp-5;
1486 const Yep64f bc17 = 0x1.1D9B9AF0438A1p-5;
1487 const Yep64f bc19 = -0x1.106A0643EEB6Cp-6;
1488 const Yep64f bc21 = 0x1.EBCC69FBEBEC2p-5;
1489 const Yep64f bc23 = 0x1.B2DE37FA33AAAp-5;
1490 const Yep64f bc25 = 0x1.8509940B63DD2p-4;
1493 const Yep64f ac3 = 0.16666666666665148;
1494 const Yep64f ac5 = 0.07500000000382832;
1495 const Yep64f ac7 = 0.044642856797897215;
1496 const Yep64f ac9 = 0.03038196019570621;
1497 const Yep64f ac11 = 0.02237173596574413;
1498 const Yep64f ac13 = 0.017360000764699752;
1499 const Yep64f ac15 = 0.013882595481880445;
1500 const Yep64f ac17 = 0.01218505505642922;
1501 const Yep64f ac19 = 0.006464733576851893;
1502 const Yep64f ac21 = 0.019689269681074158;
1503 const Yep64f ac23 = -0.016460836229539505;
1504 const Yep64f ac25 = 0.03206496584324872;
1506 const Yep64f bc0 = 1.5707963267948966;
1507 const Yep64f bc1 = 2.8284271247461876;
1508 const Yep64f bc3 = -0.4714045207912061;
1509 const Yep64f bc5 = 0.21213203436105998;
1510 const Yep64f bc7 = -0.12626906689714992;
1511 const Yep64f bc9 = 0.08593317591185387;
1512 const Yep64f bc11 = -0.0632764000455824;
1513 const Yep64f bc13 = 0.04910801555646922;
1514 const Yep64f bc15 = -0.03920282883060366;
1515 const Yep64f bc17 = 0.034864237417523876;
1516 const Yep64f bc19 = -0.01662684070445712;
1517 const Yep64f bc21 = 0.060033995628484785;
1518 const Yep64f bc23 = 0.053084477740062155;
1519 const Yep64f bc25 = 0.0949798377025595;
1521 const Yep64f absX = yepBuiltin_Abs_64f_64f(x);
1525 const Yep64f ax2 = ax * ax;
1527 af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac23);
1528 af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac21);
1529 af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac19);
1530 af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac17);
1531 af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac15);
1532 af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac13);
1533 af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac11);
1534 af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac9);
1535 af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac7);
1536 af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac5);
1537 af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac3);
1538 af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af * ax2, ax, ax);
1541 const Yep64f bx2 = absX * half - half;
1542 const Yep64f bx = yepBuiltin_Sqrt_64f_64f(bx2);
1544 bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc23);
1545 bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc21);
1546 bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc19);
1547 bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc17);
1548 bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc15);
1549 bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc13);
1550 bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc11);
1551 bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc9);
1552 bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc7);
1553 bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc5);
1554 bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc3);
1555 bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc1);
1556 bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx, bc0);
1557 return x > 0.0 ? bf : -bf;
1560 if (yepBuiltin_IsNaN_64f(absX)) {
1563 return yepBuiltin_NaN_64f();
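/* This routine evaluates in two regimes: for small |x| an odd polynomial in x
 * (the ac* coefficients, close to the arcsine series), and near |x| ~ 1 an
 * endpoint expansion driven by a square root (the bc* coefficients, with
 * bc0 = pi/2), with the final sign taken from x. NaN inputs fall through to the
 * explicit NaN return above; the branch conditions themselves are on elided
 * lines. */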
1569 #if defined(YEP_MICROSOFT_COMPILER) && defined(YEP_X86_CPU)
1570 return __emulu(x, y);
1577 #if defined(YEP_MICROSOFT_COMPILER) && defined(YEP_X86_CPU)
1578 return __emul(x, y);
1585 #if defined(YEP_MICROSOFT_COMPILER) && defined(YEP_X86_CPU)
1586 return Yep32u(__emulu(x, y) >> 32);
1587 #elif defined(YEP_NVIDIA_COMPILER)
1588 return __umulhi(x, y);
1595 #if defined(YEP_MICROSOFT_COMPILER) && defined(YEP_X86_CPU)
1597 #elif defined(YEP_NVIDIA_COMPILER)
1598 return __mulhi(x, y);
1604 #if defined(YEP_MSVC_COMPATIBLE_COMPILER) && (defined(YEP_X64_ABI) || defined(YEP_IA64_ABI))
1607 result.low = _umul128(x, y, &result.high);
1610 #elif defined(YEP_GCC_COMPATIBLE_COMPILER) && defined(YEP_X64_ABI)
1612 const __uint128_t product = ((__uint128_t)x) * ((__uint128_t)y);
1614 result.low = Yep64u(product);
1615 result.high = Yep64u(product >> 64);
1618 #elif defined(YEP_NVIDIA_COMPILER) && defined(YEP_CUDA_GPU)
1622 result.high = __umul64hi(x, y);
1627 #if defined(YEP_MSVC_COMPATIBLE_COMPILER) && (defined(YEP_X64_ABI) || defined(YEP_IA64_ABI))
1631 result.low = _mul128(x, y, &highPart);
1632 result.high = highPart;
1635 #elif defined(YEP_GCC_COMPATIBLE_COMPILER) && defined(YEP_X64_ABI)
1637 const __int128_t product = ((__int128_t)x) * ((__int128_t)y);
1639 result.low = Yep64u(product);
1640 result.high = Yep64u(((__uint128_t)product) >> 64);
1643 #elif defined(YEP_NVIDIA_COMPILER) && defined(YEP_CUDA_GPU)
1647 result.high = __mul64hi(x, y);
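/* Full 64x64 -> 128-bit multiplication: MSVC-compatible x86-64 builds use the
 * _umul128/_mul128 intrinsics (high half returned through a pointer),
 * GCC-compatible x86-64 builds widen through __uint128_t/__int128_t, and the
 * CUDA path pairs an ordinary 64-bit multiply for the low half with
 * __umul64hi/__mul64hi for the high half. The remaining fallbacks (elided)
 * presumably compose the result from 32x32 partial products. */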
1653 #if defined(YEP_X86_CPU)
1654 #if defined(YEP_GCC_COMPATIBLE_COMPILER)
1655 #if defined(YEP_X86_ABI) && defined(YEP_PIC)
1656 static YEP_INLINE void __cpuid(int CPUInfo[4], int InfoType) {
1657 CPUInfo[0] = InfoType;
1659 "movl %%ebx, %%edi;"
1661 "xchgl %%ebx, %%edi;"
1662 : "+a" (CPUInfo[0]), "=D" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3])
1668 static YEP_INLINE void __cpuidex(int CPUInfo[4], int InfoType, int ECXValue) {
1669 CPUInfo[0] = InfoType;
1670 CPUInfo[2] = ECXValue;
1672 "movl %%ebx, %%edi;"
1674 "xchgl %%ebx, %%edi;"
1675 : "+a" (CPUInfo[0]), "=D" (CPUInfo[1]), "+c" (CPUInfo[2]), "=d" (CPUInfo[3])
1681 static YEP_INLINE void __cpuid(int CPUInfo[4], int InfoType) {
1682 CPUInfo[0] = InfoType;
1685 : "+a" (CPUInfo[0]), "=b" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3])
1691 static YEP_INLINE void __cpuidex(int CPUInfo[4], int InfoType, int ECXValue) {
1692 CPUInfo[0] = InfoType;
1693 CPUInfo[2] = ECXValue;
1696 : "+a" (CPUInfo[0]), "=b" (CPUInfo[1]), "+c" (CPUInfo[2]), "=d" (CPUInfo[3])
1703 #if !defined(YEP_INTEL_COMPILER) && !defined(YEP_K1OM_X64_ABI)
1708 : "=a"(lo), "=d"(hi)
1715 #elif defined(YEP_MSVC_COMPATIBLE_COMPILER)
1717 #if defined(YEP_MICROSOFT_COMPILER) && _MSC_FULL_VER < 150030729
1718 #pragma section(".text")
1720 #if defined(YEP_X86_CPU)
1723 __declspec(allocate(".text")) static const char __cpuidex_bytecode[] =
1724 "\x53\x56\x8B\x74\x24\x0C\x89\xD0\x0F\xA2\x89\x06\x89\x5E\x04\x89";
1729 __declspec(allocate(".text")) static const char __cpuidex_bytecode[] =
1730 "\x53\x89\xD0\x0F\xA2\x41\x89\x00\x41\x89\x58\x04\x41\x89\x48\x08\x41\x89\x50\x0C\x5B\xC3";
1734 typedef void (__fastcall *CpuidexPointer)(int, int, int[4]);
1736 static YEP_INLINE void __cpuidex(int CPUInfo[4], int InfoType, int ECXValue) {
1737 (CpuidexPointer(&__cpuidex_bytecode))(ECXValue, InfoType, CPUInfo);
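/* Older MSVC toolchains (checked via _MSC_FULL_VER above) lack __cpuidex, so
 * the header appears to embed a tiny machine-code stub directly in the .text
 * section as a byte string and calls it through a casted function pointer; the
 * stub presumably executes CPUID with the requested leaf/subleaf and stores
 * EAX..EDX into the output array. Separate encodings are provided for 32-bit
 * and 64-bit targets. */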
1742 #if defined(YEP_MICROSOFT_COMPILER) && _MSC_FULL_VER < 160040219
1743 #pragma section(".text")
1745 #if defined(YEP_X86_CPU)
1748 __declspec(allocate(".text")) static const char _xgetbv_bytecode[] =
1753 __declspec(allocate(".text")) static const char _xgetbv_bytecode[] =
1754 "\x0F\x01\xD0\x48\xC1\xE2\x20\x48\x09\xD0\xC3";
1759 typedef unsigned __int64 (__fastcall *XgetbvPointer)(unsigned int);
1762 return (XgetbvPointer(&_xgetbv_bytecode))(ext_ctrl_reg);
1764 #elif !defined(YEP_INTEL_COMPILER)
1766 extern "C" unsigned __int64 __cdecl _xgetbv(unsigned int ext_ctrl_reg);
1767 #pragma intrinsic(_xgetbv)
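/* On toolchains that already ship _xgetbv, the intrinsic is declared and forced
 * via #pragma intrinsic instead of the bytecode stub above; it is typically used
 * to read XCR0 and check which extended register states (e.g. the AVX ymm
 * state) the OS has enabled. */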