yepBuiltin.h

/*
 * Yeppp! library header
 *
 * This file is part of Yeppp! library and licensed under the New BSD license.
 *
 * Copyright (C) 2010-2012 Marat Dukhan
 * Copyright (C) 2012-2013 Georgia Institute of Technology
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the Georgia Institute of Technology nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#pragma once

#ifndef __cplusplus
    #error "These intrinsics should only be used in C++ code"
#endif

#include <yepPredefines.h>
#include <yepTypes.h>
#include <math.h>

#if defined(YEP_MSVC_COMPATIBLE_COMPILER)
    #include <intrin.h>
#endif
#if defined(YEP_GCC_COMPATIBLE_COMPILER) && defined(YEP_X86_CPU)
    #include <x86intrin.h>
#endif

YEP_NATIVE_FUNCTION static YEP_INLINE void yepBuiltin_Break() {
#if defined(YEP_MSVC_COMPATIBLE_COMPILER)
    __debugbreak();
#elif defined(YEP_GCC_COMPATIBLE_COMPILER) || defined(YEP_NVIDIA_COMPILER)
    __builtin_trap();
#else
    #error "Unsupported compiler"
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE void yepBuiltin_AssumeUnreachable() {
#if defined(YEP_MSVC_COMPATIBLE_COMPILER)
    __assume(0);
#elif defined(YEP_GNU_COMPILER)
    /* Supported since gcc 4.5 */
    #if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5))
        __builtin_unreachable();
    #else
        yepBuiltin_Break();
    #endif
#elif defined(YEP_CLANG_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_UNIX)
    __builtin_unreachable();
#elif defined(YEP_NVIDIA_COMPILER)
    yepBuiltin_Break();
#else
    #error "Unsupported compiler"
#endif
}

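/* Illustrative sketch (editor's addition, not part of the original Yeppp! sources):
 * a typical use of yepBuiltin_AssumeUnreachable is the default label of a switch
 * over a closed set of values, which lets the compiler drop the range check.
 * The function below is hypothetical and exists only as a usage example. */
static YEP_INLINE Yep32u yepExample_DecodeWidth(Yep32u tag) {
    switch (tag & 3u) {
        case 0u: return 8u;
        case 1u: return 16u;
        case 2u: return 32u;
        case 3u: return 64u;
        default: yepBuiltin_AssumeUnreachable(); return 0u;
    }
}
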
YEP_NATIVE_FUNCTION static YEP_INLINE YepSize yepBuiltin_GetPointerMisalignment(const void* pointer, YepSize alignment) {
    return YepSize(pointer) % alignment;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64u yepBuiltin_CombineParts_32u32u_64u(Yep32u hi, Yep32u lo) {
    return (Yep64u(hi) << 32) | Yep64u(lo);
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32u yepBuiltin_GetHighPart_64u_32u(Yep64u n) {
    return Yep32u(n >> 32);
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32u yepBuiltin_GetLowPart_64u_32u(Yep64u n) {
    return Yep32u(n);
}

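/* Illustrative sketch (editor's addition, not part of the original Yeppp! sources):
 * the three part helpers above are exact inverses of each other. This hypothetical
 * check combines 0xDEADBEEFu (high half) with 0x01234567u (low half) into
 * 0xDEADBEEF01234567ull and recovers both halves. */
static YEP_INLINE YepBoolean yepExample_PartsRoundTrip() {
    const Yep64u word = yepBuiltin_CombineParts_32u32u_64u(0xDEADBEEFu, 0x01234567u);
    return (yepBuiltin_GetHighPart_64u_32u(word) == 0xDEADBEEFu) &&
           (yepBuiltin_GetLowPart_64u_32u(word) == 0x01234567u);
}
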
YEP_NATIVE_FUNCTION static YEP_INLINE Yep64u yepBuiltin_Cast_64f_64u(Yep64f x) {
#if defined(YEP_NVIDIA_COMPILER)
    return __double_as_longlong(x);
#elif defined(YEP_INTEL_COMPILER)
    return _castf64_u64(x);
#else
    union {
        Yep64f float64;
        Yep64u word64;
    } float64_word64;
    float64_word64.float64 = x;
    return float64_word64.word64;
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Cast_64u_64f(Yep64u x) {
#if defined(YEP_NVIDIA_COMPILER)
    return __longlong_as_double(x);
#elif defined(YEP_INTEL_COMPILER)
    return _castu64_f64(x);
#else
    union {
        Yep64f float64;
        Yep64u word64;
    } float64_word64;
    float64_word64.word64 = x;
    return float64_word64.float64;
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32u yepBuiltin_Cast_32f_32u(Yep32f x) {
#if defined(YEP_NVIDIA_COMPILER)
    return __float_as_int(x);
#elif defined(YEP_INTEL_COMPILER)
    return _castf32_u32(x);
#else
    union {
        Yep32f float32;
        Yep32u word32;
    } float32_word32;
    float32_word32.float32 = x;
    return float32_word32.word32;
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_Cast_32u_32f(Yep32u x) {
#if defined(YEP_NVIDIA_COMPILER)
    return __int_as_float(x);
#elif defined(YEP_INTEL_COMPILER)
    return _castu32_f32(x);
#else
    union {
        Yep32f float32;
        Yep32u word32;
    } float32_word32;
    float32_word32.word32 = x;
    return float32_word32.float32;
#endif
}

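/* Illustrative sketch (editor's addition, not part of the original Yeppp! sources):
 * the Cast helpers reinterpret bits without numeric conversion, e.g. 1.0f has the
 * IEEE bit pattern 0x3F800000. The union fallback relies on the type-punning
 * guarantee that GCC-compatible compilers document for unions. */
static YEP_INLINE YepBoolean yepExample_CastRoundTrip() {
    return (yepBuiltin_Cast_32f_32u(1.0f) == 0x3F800000u) &&
           (yepBuiltin_Cast_32u_32f(0x3F800000u) == 1.0f);
}
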
YEP_NATIVE_FUNCTION static YEP_INLINE Yep64u yepBuiltin_Map_64f_64u(Yep64f x) {
    const Yep64u signMask = 0x8000000000000000ull;

    const Yep64u n = yepBuiltin_Cast_64f_64u(x);
    const Yep64u mask = Yep64u(Yep64s(n) >> 62) >> 1;
    return n ^ mask ^ signMask;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Map_64u_64f(Yep64u n) {
    const Yep64u signMask = 0x8000000000000000ull;

    const Yep64u m = n ^ signMask;
    const Yep64u mask = Yep64u(Yep64s(m) >> 62) >> 1;
    return yepBuiltin_Cast_64u_64f(m ^ mask);
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32u yepBuiltin_Map_32f_32u(Yep32f x) {
    const Yep32u signMask = 0x80000000u;

    const Yep32u n = yepBuiltin_Cast_32f_32u(x);
    const Yep32u mask = Yep32u(Yep32s(n) >> 30) >> 1;
    return n ^ mask ^ signMask;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_Map_32u_32f(Yep32u n) {
    const Yep32u signMask = 0x80000000u;

    const Yep32u m = n ^ signMask;
    const Yep32u mask = Yep32u(Yep32s(m) >> 30) >> 1;
    return yepBuiltin_Cast_32u_32f(m ^ mask);
}

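/* Illustrative sketch (editor's addition, not part of the original Yeppp! sources):
 * the Map helpers implement the standard order-preserving bijection between IEEE
 * floating-point values and unsigned integers: a non-negative float maps to its
 * bit pattern plus the sign mask, a negative float maps to the bitwise complement
 * of its pattern, so unsigned integer comparison of mapped values matches the
 * floating-point ordering (with -0.0 ordered below +0.0). */
static YEP_INLINE YepBoolean yepExample_MapPreservesOrder() {
    return (yepBuiltin_Map_32f_32u(-1.0f) < yepBuiltin_Map_32f_32u(-0.0f)) &&
           (yepBuiltin_Map_32f_32u(-0.0f) < yepBuiltin_Map_32f_32u( 0.0f)) &&
           (yepBuiltin_Map_32f_32u( 0.0f) < yepBuiltin_Map_32f_32u( 1.0f));
}
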
YEP_NATIVE_FUNCTION static YEP_INLINE Yep16u yepBuiltin_ByteSwap_16u_16u(Yep16u n) {
#if defined(YEP_GNU_COMPILER) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)) || ((__GNUC__ == 4) && (__GNUC_MINOR__ == 7) && (__GNUC_PATCHLEVEL__ >= 3)))
    return __builtin_bswap16(n);
#elif defined(YEP_CLANG_COMPILER) && ((__clang_major__ > 3) || ((__clang_major__ == 3) && (__clang_minor__ >= 2)))
    return __builtin_bswap16(n);
#elif defined(YEP_INTEL_COMPILER)
    return _rotwl(n, 8);
#elif defined(YEP_GCC_COMPATIBLE_COMPILER)
    return Yep16u(__builtin_bswap32(n << 16));
#elif defined(YEP_MSVC_COMPATIBLE_COMPILER)
    return _byteswap_ushort(n);
#else
    return Yep16u(n >> 8) | Yep16u(n << 8);
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32u yepBuiltin_ByteSwap_32u_32u(Yep32u n) {
#if defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_LINUX)
    return __builtin_bswap32(n);
#elif defined(YEP_MICROSOFT_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_WINDOWS)
    return _byteswap_ulong(n);
#elif defined(YEP_NVIDIA_COMPILER)
    /* Selector 0x0123 reverses the four bytes (0x3210 would be the identity) */
    return __byte_perm(n, n, 0x0123);
#else
    return (n >> 24) | ((n >> 8) & 0x0000FF00u) | ((n << 8) & 0x00FF0000u) | (n << 24);
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64u yepBuiltin_ByteSwap_64u_64u(Yep64u n) {
#if defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_LINUX)
    return __builtin_bswap64(n);
#elif defined(YEP_MICROSOFT_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_WINDOWS)
    return _byteswap_uint64(n);
#else
    const Yep32u nLo = yepBuiltin_GetLowPart_64u_32u(n);
    const Yep32u nHi = yepBuiltin_GetHighPart_64u_32u(n);
    const Yep32u nLoSwapped = yepBuiltin_ByteSwap_32u_32u(nLo);
    const Yep32u nHiSwapped = yepBuiltin_ByteSwap_32u_32u(nHi);
    return yepBuiltin_CombineParts_32u32u_64u(nLoSwapped, nHiSwapped);
#endif
}

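/* Illustrative sketch (editor's addition, not part of the original Yeppp! sources):
 * the ByteSwap helpers reverse byte order for endianness conversion; the 64-bit
 * fallback swaps each 32-bit half and then exchanges the halves. */
static YEP_INLINE YepBoolean yepExample_ByteSwap() {
    return (yepBuiltin_ByteSwap_16u_16u(0x1234u) == 0x3412u) &&
           (yepBuiltin_ByteSwap_32u_32u(0x12345678u) == 0x78563412u) &&
           (yepBuiltin_ByteSwap_64u_64u(0x0102030405060708ull) == 0x0807060504030201ull);
}
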
YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_Abs_32f_32f(Yep32f x) {
#if defined(YEP_MICROSOFT_COMPILER)
    return abs(x);
#elif defined(YEP_INTEL_COMPILER)
    return fabsf(x);
#elif defined(YEP_ARM_COMPILER)
    return __fabsf(x);
#elif defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER)
    return __builtin_fabsf(x);
#elif defined(YEP_NVIDIA_COMPILER)
    return fabsf(x);
#else
    return x >= 0.0f ? x : -x;
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Abs_64f_64f(Yep64f x) {
#if defined(YEP_MICROSOFT_COMPILER)
    return abs(x);
#elif defined(YEP_INTEL_COMPILER)
    return fabs(x);
#elif defined(YEP_ARM_COMPILER)
    return __fabs(x);
#elif defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER)
    return __builtin_fabs(x);
#elif defined(YEP_NVIDIA_COMPILER)
    return fabs(x);
#else
    return x >= 0.0 ? x : -x;
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_Convert_8s_32f(Yep8s number) {
    return Yep32f(Yep32s(number));
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_Convert_16s_32f(Yep16s number) {
    return Yep32f(Yep32s(number));
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_Convert_32s_32f(Yep32s number) {
    return Yep32f(number);
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_Convert_64s_32f(Yep64s number) {
    return Yep32f(number);
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Convert_8s_64f(Yep8s number) {
    return Yep64f(Yep32s(number));
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Convert_16s_64f(Yep16s number) {
    return Yep64f(Yep32s(number));
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Convert_32s_64f(Yep32s number) {
    return Yep64f(number);
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Convert_64s_64f(Yep64s number) {
    return Yep64f(number);
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_Convert_8u_32f(Yep8u number) {
    return Yep32f(Yep32s(Yep32u(number)));
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_Convert_16u_32f(Yep16u number) {
    return Yep32f(Yep32s(Yep32u(number)));
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_Convert_32u_32f(Yep32u number) {
#if defined(YEP_ARM_CPU)
    return Yep32f(number);
#elif defined(YEP_MIPS_CPU)
    return Yep32f(Yep32s(number & 0x7FFFFFFFu)) - Yep32f(Yep32s(number & 0x80000000u));
#else
    return Yep32f(Yep64s(Yep64u(number)));
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_Convert_64u_32f(Yep64u number) {
#if defined(YEP_ARM_CPU)
    return Yep32f(Yep64s(number & 0x7FFFFFFFFFFFFFFFull)) - Yep32f(Yep64s(number & 0x8000000000000000ull));
#else
    return Yep32f(number);
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Convert_8u_64f(Yep8u number) {
    return Yep64f(Yep32s(Yep32u(number)));
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Convert_16u_64f(Yep16u number) {
    return Yep64f(Yep32s(Yep32u(number)));
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Convert_32u_64f(Yep32u number) {
#if defined(YEP_ARM_CPU)
    return Yep64f(number);
#else
    return Yep64f(Yep64s(Yep64u(number)));
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Convert_64u_64f(Yep64u number) {
#if defined(YEP_ARM_CPU)
    return Yep64f(number);
#else
    return Yep64f(Yep64s(number & 0x7FFFFFFFFFFFFFFFull)) - Yep64f(Yep64s(number & 0x8000000000000000ull));
#endif
}

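/* Editor's note (not part of the original Yeppp! sources): where no native
 * unsigned conversion exists, the 64u -> floating-point paths split the input at
 * the sign bit. The low 63 bits convert exactly through the signed path, while
 * the subtracted term Yep64f(Yep64s(number & 0x8000000000000000ull)) evaluates to
 * either -0.0 or -2^63, i.e. subtracting it adds 2^63 back for inputs with the
 * top bit set. Worked example: number = 0x8000000000000000ull gives
 * 0.0 - (-2^63) = 2^63. */
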
YEP_NATIVE_FUNCTION static YEP_INLINE Yep8u yepBuiltin_Min_8u8u_8u(Yep8u a, Yep8u b) {
    return (a > b) ? b : a;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep8s yepBuiltin_Min_8s8s_8s(Yep8s a, Yep8s b) {
    return (a > b) ? b : a;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep16u yepBuiltin_Min_16u16u_16u(Yep16u a, Yep16u b) {
    return (a > b) ? b : a;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep16s yepBuiltin_Min_16s16s_16s(Yep16s a, Yep16s b) {
    return (a > b) ? b : a;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32u yepBuiltin_Min_32u32u_32u(Yep32u a, Yep32u b) {
    return (a > b) ? b : a;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32s yepBuiltin_Min_32s32s_32s(Yep32s a, Yep32s b) {
    return (a > b) ? b : a;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64u yepBuiltin_Min_64u64u_64u(Yep64u a, Yep64u b) {
    return (a > b) ? b : a;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64s yepBuiltin_Min_64s64s_64s(Yep64s a, Yep64s b) {
    return (a > b) ? b : a;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_Min_32f32f_32f(Yep32f a, Yep32f b) {
#if defined(YEP_NVIDIA_COMPILER)
    return fminf(a, b);
#elif defined(YEP_COMPILER_SUPPORTS_X86_SSE_EXTENSION) && defined(YEP_PROCESSOR_SUPPORTS_X86_SSE_EXTENSION)
    return _mm_cvtss_f32(_mm_min_ss(_mm_set_ss(a), _mm_set_ss(b)));
#elif defined(YEP_PROCESSOR_SUPPORTS_SINGLE_PRECISION_FPU_INSTRUCTIONS)
    if YEP_LIKELY(b == b) {
        return (a < b) ? a : b;
    } else {
        return a;
    }
#else
    Yep32u au = yepBuiltin_Cast_32f_32u(a);
    Yep32u bu = yepBuiltin_Cast_32f_32u(b);

    /* Check if b is NaN */
    const Yep32u twoBu = bu + bu;
    if YEP_UNLIKELY(twoBu > 0xFF000000u) {
        /* b is NaN, return a */
        bu = au;
    }

    /* Check if a is NaN */
    const Yep32u twoAu = au + au;
    if YEP_UNLIKELY(twoAu > 0xFF000000u) {
        /* a is NaN, return b */
        au = bu;
    }

    const Yep32s as = Yep32s(au) >= 0 ? au : 0x80000000 - au;
    const Yep32s bs = Yep32s(bu) >= 0 ? bu : 0x80000000 - bu;
    return as < bs ? yepBuiltin_Cast_32u_32f(au) : yepBuiltin_Cast_32u_32f(bu);
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Min_64f64f_64f(Yep64f a, Yep64f b) {
#if defined(YEP_NVIDIA_COMPILER)
    return fmin(a, b);
#elif defined(YEP_COMPILER_SUPPORTS_X86_SSE2_EXTENSION) && defined(YEP_PROCESSOR_SUPPORTS_X86_SSE2_EXTENSION)
    return _mm_cvtsd_f64(_mm_min_sd(_mm_set_sd(a), _mm_set_sd(b)));
#elif defined(YEP_PROCESSOR_SUPPORTS_DOUBLE_PRECISION_FPU_INSTRUCTIONS)
    if YEP_LIKELY(b == b) {
        return (a < b) ? a : b;
    } else {
        return a;
    }
#else
    Yep64u au = yepBuiltin_Cast_64f_64u(a);
    Yep64u bu = yepBuiltin_Cast_64f_64u(b);

    /* Check if b is NaN */
    const Yep64u negBu = bu | 0x8000000000000000ull;
    if YEP_UNLIKELY(negBu > 0xFFF0000000000000ull) {
        /* b is NaN, return a */
        bu = au;
    }

    /* Check if a is NaN */
    const Yep64u negAu = au | 0x8000000000000000ull;
    if YEP_UNLIKELY(negAu > 0xFFF0000000000000ull) {
        /* a is NaN, return b */
        au = bu;
    }

    const Yep64s as = Yep64s(au) >= 0ll ? au : 0x8000000000000000ll - au;
    const Yep64s bs = Yep64s(bu) >= 0ll ? bu : 0x8000000000000000ll - bu;
    return as < bs ? yepBuiltin_Cast_64u_64f(au) : yepBuiltin_Cast_64u_64f(bu);
#endif
}

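/* Editor's note (not part of the original Yeppp! sources): the floating-point
 * Min/Max fallbacks follow fmin()/fmax()-style NaN handling: when exactly one
 * argument is NaN, the other argument is returned. The NaN tests exploit the IEEE
 * encoding: a NaN has all exponent bits set and a non-zero mantissa, so bu + bu
 * (which discards the sign bit) exceeds the shifted infinity pattern 0xFF000000u,
 * and likewise bu | signMask exceeds 0xFFF0000000000000ull in the double version.
 * The final comparison converts the sign-magnitude float encoding to two's
 * complement (signMask - bits for negative values) so that a plain signed integer
 * compare orders the values like the floats they encode. */
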
YEP_NATIVE_FUNCTION static YEP_INLINE Yep8u yepBuiltin_Max_8u8u_8u(Yep8u a, Yep8u b) {
    return (a > b) ? a : b;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep8s yepBuiltin_Max_8s8s_8s(Yep8s a, Yep8s b) {
    return (a > b) ? a : b;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep16u yepBuiltin_Max_16u16u_16u(Yep16u a, Yep16u b) {
    return (a > b) ? a : b;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep16s yepBuiltin_Max_16s16s_16s(Yep16s a, Yep16s b) {
    return (a > b) ? a : b;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32u yepBuiltin_Max_32u32u_32u(Yep32u a, Yep32u b) {
    return (a > b) ? a : b;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32s yepBuiltin_Max_32s32s_32s(Yep32s a, Yep32s b) {
    return (a > b) ? a : b;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64u yepBuiltin_Max_64u64u_64u(Yep64u a, Yep64u b) {
    return (a > b) ? a : b;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64s yepBuiltin_Max_64s64s_64s(Yep64s a, Yep64s b) {
    return (a > b) ? a : b;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_Max_32f32f_32f(Yep32f a, Yep32f b) {
#if defined(YEP_NVIDIA_COMPILER)
    return fmaxf(a, b);
#elif defined(YEP_COMPILER_SUPPORTS_X86_SSE_EXTENSION) && defined(YEP_PROCESSOR_SUPPORTS_X86_SSE_EXTENSION)
    return _mm_cvtss_f32(_mm_max_ss(_mm_set_ss(a), _mm_set_ss(b)));
#elif defined(YEP_PROCESSOR_SUPPORTS_SINGLE_PRECISION_FPU_INSTRUCTIONS)
    if YEP_LIKELY(b == b) {
        return (a > b) ? a : b;
    } else {
        return a;
    }
#else
    Yep32u au = yepBuiltin_Cast_32f_32u(a);
    Yep32u bu = yepBuiltin_Cast_32f_32u(b);

    /* Check if b is NaN */
    const Yep32u twoBu = bu + bu;
    if YEP_UNLIKELY(twoBu > 0xFF000000u) {
        /* b is NaN, return a */
        bu = au;
    }

    /* Check if a is NaN */
    const Yep32u twoAu = au + au;
    if YEP_UNLIKELY(twoAu > 0xFF000000u) {
        /* a is NaN, return b */
        au = bu;
    }

    const Yep32s as = Yep32s(au) >= 0 ? au : 0x80000000 - au;
    const Yep32s bs = Yep32s(bu) >= 0 ? bu : 0x80000000 - bu;
    return as > bs ? yepBuiltin_Cast_32u_32f(au) : yepBuiltin_Cast_32u_32f(bu);
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Max_64f64f_64f(Yep64f a, Yep64f b) {
#if defined(YEP_NVIDIA_COMPILER)
    return fmax(a, b);
#elif defined(YEP_COMPILER_SUPPORTS_X86_SSE2_EXTENSION) && defined(YEP_PROCESSOR_SUPPORTS_X86_SSE2_EXTENSION)
    return _mm_cvtsd_f64(_mm_max_sd(_mm_set_sd(a), _mm_set_sd(b)));
#elif defined(YEP_PROCESSOR_SUPPORTS_DOUBLE_PRECISION_FPU_INSTRUCTIONS)
    if YEP_LIKELY(b == b) {
        return (a > b) ? a : b;
    } else {
        return a;
    }
#else
    Yep64u au = yepBuiltin_Cast_64f_64u(a);
    Yep64u bu = yepBuiltin_Cast_64f_64u(b);

    /* Check if b is NaN */
    const Yep64u negBu = bu | 0x8000000000000000ull;
    if YEP_UNLIKELY(negBu > 0xFFF0000000000000ull) {
        /* b is NaN, return a */
        bu = au;
    }

    /* Check if a is NaN */
    const Yep64u negAu = au | 0x8000000000000000ull;
    if YEP_UNLIKELY(negAu > 0xFFF0000000000000ull) {
        /* a is NaN, return b */
        au = bu;
    }

    const Yep64s as = Yep64s(au) >= 0ll ? au : 0x8000000000000000ll - au;
    const Yep64s bs = Yep64s(bu) >= 0ll ? bu : 0x8000000000000000ll - bu;
    return as > bs ? yepBuiltin_Cast_64u_64f(au) : yepBuiltin_Cast_64u_64f(bu);
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32u yepBuiltin_Clamp_32u32u32u_32u(Yep32u x, Yep32u xMin, Yep32u xMax) {
    return (x < xMin) ? xMin : (x > xMax) ? xMax : x;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32s yepBuiltin_Clamp_32s32s32s_32s(Yep32s x, Yep32s xMin, Yep32s xMax) {
    return (x < xMin) ? xMin : (x > xMax) ? xMax : x;
}

YEP_NATIVE_FUNCTION static YEP_INLINE YepBoolean yepBuiltin_IsNaN_64f(Yep64f n) {
    return !(n == n);
}

YEP_NATIVE_FUNCTION static YEP_INLINE YepBoolean yepBuiltin_IsNaN_32f(Yep32f n) {
    return !(n == n);
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_PositiveInfinity_32f() {
#if defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER) || defined(YEP_NVIDIA_COMPILER)
    return __builtin_inff();
#else
    const static Yep32f one = 1.0f;
    const static Yep32f zero = 0.0f;
    return one / zero;
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_PositiveInfinity_64f() {
#if defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER) || defined(YEP_NVIDIA_COMPILER)
    return __builtin_inf();
#else
    const static Yep64f one = 1.0;
    const static Yep64f zero = 0.0;
    return one / zero;
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_NegativeInfinity_32f() {
    return -yepBuiltin_PositiveInfinity_32f();
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_NegativeInfinity_64f() {
    return -yepBuiltin_PositiveInfinity_64f();
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_NaN_32f() {
#if defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER) || defined(YEP_NVIDIA_COMPILER)
    return __builtin_nanf("");
#else
    const static Yep32f zero = 0.0f;
    return zero / zero;
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_NaN_64f() {
#if defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER) || defined(YEP_NVIDIA_COMPILER)
    return __builtin_nan("");
#else
    const static Yep64f zero = 0.0;
    return zero / zero;
#endif
}

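/* Illustrative sketch (editor's addition, not part of the original Yeppp! sources):
 * NaN is the only IEEE value that compares unequal to itself, which is exactly
 * the property the IsNaN helpers above rely on. */
static YEP_INLINE YepBoolean yepExample_NaNNeverEqualsItself() {
    const Yep64f nan = yepBuiltin_NaN_64f();
    return yepBuiltin_IsNaN_64f(nan) && !(nan == nan);
}
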
YEP_NATIVE_FUNCTION static YEP_INLINE Yep32u yepBuiltin_Nlz_64u_32u(Yep64u x) {
#if defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_UNIX)
    return __builtin_clzll(x);
#elif (defined(YEP_MICROSOFT_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_WINDOWS)) && (defined(YEP_IA64_ABI) || defined(YEP_X64_ABI))
    if (x == 0ull) {
        return 64u;
    } else {
        unsigned long bitPosition;
        _BitScanReverse64(&bitPosition, x);
        return 63u - bitPosition;
    }
#elif (defined(YEP_MICROSOFT_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_WINDOWS)) && defined(YEP_X86_CPU)
    const Yep32u xHi = yepBuiltin_GetHighPart_64u_32u(x);
    const Yep32u xLo = yepBuiltin_GetLowPart_64u_32u(x);
    unsigned long bitPositionHi, bitPositionLo;
    _BitScanReverse(&bitPositionLo, xLo);
    _BitScanReverse(&bitPositionHi, xHi);
    if YEP_UNLIKELY(xHi == 0u) {
        if YEP_UNLIKELY(xLo == 0u) {
            return 64u;
        } else {
            return 63u - bitPositionLo;
        }
    } else {
        return 31u - bitPositionHi;
    }
#elif defined(YEP_NVIDIA_COMPILER)
    return __clzll(x);
#else
    #error "Compiler-specific implementation needed"
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32u yepBuiltin_Nlz_32u_32u(Yep32u x) {
#if defined(YEP_GNU_COMPILER) || defined(YEP_CLANG_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_UNIX)
    return __builtin_clz(x);
#elif (defined(YEP_MICROSOFT_COMPILER) || defined(YEP_INTEL_COMPILER_FOR_WINDOWS))
    if (x == 0u) {
        return 32u;
    } else {
        unsigned long bitPosition;
        _BitScanReverse(&bitPosition, x);
        return 31u - bitPosition;
    }
#elif defined(YEP_NVIDIA_COMPILER)
    return __clz(x);
#else
    #error "Compiler-specific implementation needed"
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Ulp_64f_64f(Yep64f x) {
    const Yep64f absX = yepBuiltin_Abs_64f_64f(x);
    if YEP_LIKELY(absX < yepBuiltin_PositiveInfinity_64f()) {
        return yepBuiltin_Cast_64u_64f(yepBuiltin_Cast_64f_64u(absX) + 1ull) - absX;
    } else {
        return x;
    }
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_Ulp_32f_32f(Yep32f x) {
    const Yep32f absX = yepBuiltin_Abs_32f_32f(x);
    if YEP_LIKELY(absX < yepBuiltin_PositiveInfinity_32f()) {
        return yepBuiltin_Cast_32u_32f(yepBuiltin_Cast_32f_32u(absX) + 1u) - absX;
    } else {
        return x;
    }
}

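/* Illustrative sketch (editor's addition, not part of the original Yeppp! sources):
 * for finite x the Ulp helpers compute nextafter(|x|, +inf) - |x| by incrementing
 * the bit pattern; for example, the ULP of 1.0f is 2^-23. */
static YEP_INLINE YepBoolean yepExample_UlpOfOne() {
    return yepBuiltin_Ulp_32f_32f(1.0f) == 1.1920928955078125e-7f; /* exactly 2^-23 */
}
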
#if defined(YEP_GCC_COMPATIBLE_COMPILER) && defined(YEP_PROCESSOR_SUPPORTS_DOUBLE_PRECISION_FMA_INSTRUCTIONS)
    YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_FMA_64f64f64f_64f(Yep64f a, Yep64f b, Yep64f c) {
        return __builtin_fma(a, b, c);
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_FMS_64f64f64f_64f(Yep64f a, Yep64f b, Yep64f c) {
        return __builtin_fma(a, b, -c);
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_FNMA_64f64f64f_64f(Yep64f a, Yep64f b, Yep64f c) {
        return __builtin_fma(-a, b, c);
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_FNMS_64f64f64f_64f(Yep64f a, Yep64f b, Yep64f c) {
        return __builtin_fma(-a, b, -c);
    }
#elif defined(YEP_NVIDIA_COMPILER) && defined(YEP_PROCESSOR_SUPPORTS_DOUBLE_PRECISION_FMA_INSTRUCTIONS)
    YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_FMA_64f64f64f_64f(Yep64f a, Yep64f b, Yep64f c) {
        return __fma_rn(a, b, c);
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_FMS_64f64f64f_64f(Yep64f a, Yep64f b, Yep64f c) {
        return __fma_rn(a, b, -c);
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_FNMA_64f64f64f_64f(Yep64f a, Yep64f b, Yep64f c) {
        return __fma_rn(-a, b, c);
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_FNMS_64f64f64f_64f(Yep64f a, Yep64f b, Yep64f c) {
        return __fma_rn(-a, b, -c);
    }
#endif

#if defined(YEP_GCC_COMPATIBLE_COMPILER) && defined(YEP_PROCESSOR_SUPPORTS_SINGLE_PRECISION_FMA_INSTRUCTIONS)
    YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_FMA_32f32f32f_32f(Yep32f a, Yep32f b, Yep32f c) {
        return __builtin_fmaf(a, b, c);
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_FMS_32f32f32f_32f(Yep32f a, Yep32f b, Yep32f c) {
        return __builtin_fmaf(a, b, -c);
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_FNMA_32f32f32f_32f(Yep32f a, Yep32f b, Yep32f c) {
        return __builtin_fmaf(-a, b, c);
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_FNMS_32f32f32f_32f(Yep32f a, Yep32f b, Yep32f c) {
        return __builtin_fmaf(-a, b, -c);
    }
#elif defined(YEP_NVIDIA_COMPILER) && defined(YEP_PROCESSOR_SUPPORTS_SINGLE_PRECISION_FMA_INSTRUCTIONS)
    YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_FMA_32f32f32f_32f(Yep32f a, Yep32f b, Yep32f c) {
        return __fmaf_rn(a, b, c);
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_FMS_32f32f32f_32f(Yep32f a, Yep32f b, Yep32f c) {
        return __fmaf_rn(a, b, -c);
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_FNMA_32f32f32f_32f(Yep32f a, Yep32f b, Yep32f c) {
        return __fmaf_rn(-a, b, c);
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_FNMS_32f32f32f_32f(Yep32f a, Yep32f b, Yep32f c) {
        return __fmaf_rn(-a, b, -c);
    }
#endif

#if defined(YEP_PROCESSOR_SUPPORTS_DOUBLE_PRECISION_FMA_INSTRUCTIONS)
    YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Divide_64f64f64f_64f(Yep64f y, Yep64f c, Yep64f rcpC) {
        const Yep64f q = y * rcpC;
        const Yep64f r = yepBuiltin_FNMA_64f64f64f_64f(c, q, y);
        return yepBuiltin_FMA_64f64f64f_64f(r, rcpC, q);
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_MultiplyAdd_64f64f64f_64f(Yep64f a, Yep64f b, Yep64f c) {
        return yepBuiltin_FMA_64f64f64f_64f(a, b, c);
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_MultiplySubtract_64f64f64f_64f(Yep64f a, Yep64f b, Yep64f c) {
        return yepBuiltin_FMS_64f64f64f_64f(a, b, c);
    }
#else
    YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Divide_64f64f64f_64f(Yep64f y, Yep64f c, Yep64f rcpC) {
        return y / c;
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_MultiplyAdd_64f64f64f_64f(Yep64f a, Yep64f b, Yep64f c) {
        return a * b + c;
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_MultiplySubtract_64f64f64f_64f(Yep64f a, Yep64f b, Yep64f c) {
        return a * b - c;
    }
#endif

#if defined(YEP_PROCESSOR_SUPPORTS_SINGLE_PRECISION_FMA_INSTRUCTIONS)
    YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_Divide_32f32f32f_32f(Yep32f y, Yep32f c, Yep32f rcpC) {
        const Yep32f q = y * rcpC;
        const Yep32f r = yepBuiltin_FNMA_32f32f32f_32f(c, q, y);
        return yepBuiltin_FMA_32f32f32f_32f(r, rcpC, q);
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_MultiplyAdd_32f32f32f_32f(Yep32f a, Yep32f b, Yep32f c) {
        return yepBuiltin_FMA_32f32f32f_32f(a, b, c);
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_MultiplySubtract_32f32f32f_32f(Yep32f a, Yep32f b, Yep32f c) {
        return yepBuiltin_FMS_32f32f32f_32f(a, b, c);
    }
#else
    YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_Divide_32f32f32f_32f(Yep32f y, Yep32f c, Yep32f rcpC) {
        return y / c;
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_MultiplyAdd_32f32f32f_32f(Yep32f a, Yep32f b, Yep32f c) {
        return a * b + c;
    }

    YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_MultiplySubtract_32f32f32f_32f(Yep32f a, Yep32f b, Yep32f c) {
        return a * b - c;
    }
#endif

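/* Editor's note (not part of the original Yeppp! sources): the FMA-based Divide
 * computes y / c from a precomputed reciprocal rcpC ~ 1/c via Markstein-style
 * refinement. The initial quotient q = y * rcpC leaves a residual r = y - c * q,
 * which the fused multiply-add evaluates without an intermediate rounding, and
 * q + r * rcpC then recovers a quotient accurate to about one ulp. Without FMA
 * hardware, the helpers simply fall back to the plain expressions. */
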
/* See algorithm 4.3 ("Fast2Sum") in "Handbook of floating-point arithmetic".
   The error term is exact only under the Fast2Sum precondition that the
   exponent of a is at least that of b (e.g. |a| >= |b|). */
YEP_NATIVE_FUNCTION static YEP_INLINE Yep64df yepBuiltin_Add_64f64f_64df_AlgFast(Yep64f a, Yep64f b) {
#if defined(YEP_NVIDIA_COMPILER)
    Yep64df sum;
    sum.high = __dadd_rn(a, b);
    const Yep64f bCorrected = __dadd_rn(sum.high, -a);
    const Yep64f deltaB = __dadd_rn(b, -bCorrected);
    sum.low = deltaB;
    return sum;
#else
    Yep64df sum;
    sum.high = a + b;
    const Yep64f bCorrected = sum.high - a;
    const Yep64f deltaB = b - bCorrected;
    sum.low = deltaB;
    return sum;
#endif
}

/* See algorithm 4.4 in "Handbook of floating-point arithmetic" */
YEP_NATIVE_FUNCTION static YEP_INLINE Yep64df yepBuiltin_Add_64f64f_64df(Yep64f a, Yep64f b) {
#if defined(YEP_NVIDIA_COMPILER)
    Yep64df sum;
    sum.high = __dadd_rn(a, b);
    const Yep64f aCorrected = __dadd_rn(sum.high, -b);
    const Yep64f bCorrected = __dadd_rn(sum.high, -aCorrected);
    const Yep64f deltaA = __dadd_rn(a, -aCorrected);
    const Yep64f deltaB = __dadd_rn(b, -bCorrected);
    sum.low = __dadd_rn(deltaA, deltaB);
    return sum;
#else
    Yep64df sum;
    sum.high = a + b;
    const Yep64f aCorrected = sum.high - b;
    const Yep64f bCorrected = sum.high - aCorrected;
    const Yep64f deltaA = a - aCorrected;
    const Yep64f deltaB = b - bCorrected;
    sum.low = deltaA + deltaB;
    return sum;
#endif
}

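/* Editor's note (not part of the original Yeppp! sources): algorithm 4.4 is the
 * branch-free 2Sum of Knuth and Moller: high holds the correctly rounded sum and
 * low the exact rounding error, so high + low == a + b exactly, with no
 * precondition on the magnitudes of a and b. Worked example: for a = 1.0 and
 * b = 2^-60, high rounds to 1.0, aCorrected = 1.0, bCorrected = 0.0, deltaA = 0.0
 * and deltaB = 2^-60, so low recovers the dropped 2^-60 exactly. */
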
YEP_NATIVE_FUNCTION static YEP_INLINE Yep64df yepBuiltin_Multiply_64f64f_64df(Yep64f a, Yep64f b) {
    Yep64df product;
    product.high = a * b;
#if defined(YEP_PROCESSOR_SUPPORTS_FMA_EXTENSION)
    product.low = yepBuiltin_FMS_64f64f64f_64f(a, b, product.high);
#else
    Yep64df da, db;
    /* Zeroes out the 27 least significant bits */
    const Yep64u mask = 0xFFFFFFFFF8000000ull;
    da.high = yepBuiltin_Cast_64u_64f(yepBuiltin_Cast_64f_64u(a) & mask);
    da.low = a - da.high;
    db.high = yepBuiltin_Cast_64u_64f(yepBuiltin_Cast_64f_64u(b) & mask);
    db.low = b - db.high;
    const Yep64f t1 = -product.high + Yep64f(da.high * db.high);
    const Yep64f t2 = t1 + Yep64f(da.high * db.low);
    const Yep64f t3 = t2 + Yep64f(da.low * db.high);
    product.low = t3 + Yep64f(da.low * db.low);
#endif
    return product;
}

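/* Editor's note (not part of the original Yeppp! sources): the non-FMA path is a
 * Dekker-style exact product: masking away the 27 low significand bits splits
 * each operand into a 26-bit high half and a small low half, so the partial
 * products da.high*db.high, da.high*db.low and da.low*db.high are exact in double
 * precision; accumulating them against -product.high recovers the rounding error
 * of a * b (the last term da.low*db.low may itself round, but its error is far
 * below the result's precision). */
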
YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_Log_32f_32f(Yep32f x) {
    const Yep32u defaultExponent = 0x3F800000u;
#if defined(YEP_COMPILER_SUPPORTS_HEXADECIMAL_FLOATING_POINT_CONSTANTS)
    const Yep32f sqrt2 = 0x1.6A09E6p+0f;
    const Yep32f c2 = -0x1.FFFFF2p-2f;
    const Yep32f c3 = 0x1.55571Ep-2f;
    const Yep32f c4 = -0x1.0006B2p-2f;
    const Yep32f c5 = 0x1.98CB06p-3f;
    const Yep32f c6 = -0x1.530B6Ap-3f;
    const Yep32f c7 = 0x1.317FD6p-3f;
    const Yep32f c8 = -0x1.26F724p-3f;
    const Yep32f c9 = 0x1.6A66D0p-4f;

    const Yep32f ln2_hi = 0x1.62E400p-1f; /* The lowest 7 bits are zeros */
    const Yep32f ln2_lo = 0x1.7F7D1Cp-20f;
#else
    const Yep32f sqrt2 = 1.41421353816986083984375f;
    const Yep32f c2 = -0.4999997913837432861328125f;
    const Yep32f c3 = 0.3333401381969451904296875f;
    const Yep32f c4 = -0.2500255405902862548828125f;
    const Yep32f c5 = 0.19960598647594451904296875f;
    const Yep32f c6 = -0.16554911434650421142578125f;
    const Yep32f c7 = 0.14916960895061492919921875f;
    const Yep32f c8 = -0.1440260708332061767578125f;
    const Yep32f c9 = 8.8476955890655517578125e-2f;

    const Yep32f ln2_hi = 0.693145751953125f; /* The lowest 7 bits are zeros */
    const Yep32f ln2_lo = 1.428606765330187045037746429443359375e-6f;
#endif
    if YEP_UNLIKELY(yepBuiltin_IsNaN_32f(x)) {
        return x;
    } else {
        const Yep32s xWord = yepBuiltin_Cast_32f_32u(x);
        if YEP_UNLIKELY(xWord < 0) {
            /* sign(x) == -1 */
            return yepBuiltin_NaN_32f();
        } else if YEP_UNLIKELY(xWord == 0) {
            /* x == +0.0 */
            return yepBuiltin_NegativeInfinity_32f();
        } else if YEP_UNLIKELY(xWord == 0x7F800000) {
            /* x == +inf */
            return x;
        }
        Yep32s exponent;
        Yep32u mantissa;
        if YEP_LIKELY(xWord >= 0x00800000u) {
            /* Normalized number */
            exponent = Yep32s(Yep32u(xWord) >> 23) - 127;
            mantissa = xWord & 0x007FFFFFu;
        } else {
            /* Denormalized number */
            const Yep32u pointOffset = yepBuiltin_Nlz_32u_32u(xWord) - 7u;
            exponent = -126 - Yep32s(pointOffset);
            mantissa = (xWord << pointOffset) & 0x007FFFFFu;
        }
        x = yepBuiltin_Cast_32u_32f(defaultExponent | mantissa);
        if (x >= sqrt2) {
            exponent += 1;
            x = x * 0.5f;
        }
        const Yep32f t = x - 1.0f;
        const Yep32f dexp = Yep32f(exponent);
        return (t + t * (t * (c2 + t * (c3 + t * (c4 + t * (c5 + t * (c6 + t * (c7 + t * (c8 + t * c9)))))))) +
            dexp * ln2_lo) + dexp * ln2_hi;
    }
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Log_64f_64f(Yep64f x) {
    const Yep64u defaultExponent = 0x3FF0000000000000ull;
#if defined(YEP_COMPILER_SUPPORTS_HEXADECIMAL_FLOATING_POINT_CONSTANTS)
    const Yep64f sqrt2 = 0x1.6A09E667F3BCDp+0;
    const Yep64f c2 = -0x1.FFFFFFFFFFFF2p-2;
    const Yep64f c3 = 0x1.5555555555103p-2;
    const Yep64f c4 = -0x1.00000000013C7p-2;
    const Yep64f c5 = 0x1.9999999A43E4Fp-3;
    const Yep64f c6 = -0x1.55555554A6A2Bp-3;
    const Yep64f c7 = 0x1.249248DAE4B2Ap-3;
    const Yep64f c8 = -0x1.FFFFFFBD8606Dp-4;
    const Yep64f c9 = 0x1.C71C90DB06248p-4;
    const Yep64f c10 = -0x1.9999C5BE751E3p-4;
    const Yep64f c11 = 0x1.745980F3FB889p-4;
    const Yep64f c12 = -0x1.554D5ACD502ABp-4;
    const Yep64f c13 = 0x1.3B4ED39194B87p-4;
    const Yep64f c14 = -0x1.25480A82633AFp-4;
    const Yep64f c15 = 0x1.0F23916A44515p-4;
    const Yep64f c16 = -0x1.EED2E2BB64B2Ep-5;
    const Yep64f c17 = 0x1.EA17E14773369p-5;
    const Yep64f c18 = -0x1.1654764F478ECp-4;
    const Yep64f c19 = 0x1.0266CD08DB2F2p-4;
    const Yep64f c20 = -0x1.CC4EC078138E3p-6;

    #if defined(YEP_PROCESSOR_SUPPORTS_FMA_EXTENSION)
        const Yep64df ln2 = { 0x1.62E42FEFA39EFp-1, 0x1.ABC9E3B39803Fp-56 };
    #else
        const Yep64df ln2 = { 0x1.62E42FEFA3800p-1, 0x1.EF35793C76730p-45 };
    #endif
#else
    const Yep64f sqrt2 = 1.4142135623730951;
    const Yep64f c2 = -0.4999999999999992;
    const Yep64f c3 = 0.3333333333332719;
    const Yep64f c4 = -0.25000000000028105;
    const Yep64f c5 = 0.20000000001936022;
    const Yep64f c6 = -0.16666666664680582;
    const Yep64f c7 = 0.14285714071282857;
    const Yep64f c8 = -0.12499999903264021;
    const Yep64f c9 = 0.11111122688488095;
    const Yep64f c10 = -0.10000016444912023;
    const Yep64f c11 = 0.09090566990173178;
    const Yep64f c12 = -0.08332572431118972;
    const Yep64f c13 = 0.07697947162641415;
    const Yep64f c14 = -0.0716019068260738;
    const Yep64f c15 = 0.06619602968955392;
    const Yep64f c16 = -0.060403292499492486;
    const Yep64f c17 = 0.059825839994664794;
    const Yep64f c18 = -0.06795164313050223;
    const Yep64f c19 = 0.06308631984365912;
    const Yep64f c20 = -0.028094947774939604;

    #if defined(YEP_PROCESSOR_SUPPORTS_FMA_EXTENSION)
        const Yep64df ln2 = { 0.6931471805599453, 2.3190468138462996e-17 };
    #else
        const Yep64df ln2 = { 0.6931471805598903, 5.497923018708371e-14 };
    #endif
#endif
    Yep32s exponent;
    Yep64u mantissa;
    const Yep64s xWord = yepBuiltin_Cast_64f_64u(x);
    if YEP_LIKELY(xWord >= 0x0010000000000000ull) {
        /* Normalized number */
        exponent = Yep32s((yepBuiltin_GetHighPart_64u_32u(Yep64u(xWord)) >> 20) & 0x7FFu) - 1023;
        mantissa = xWord & 0x000FFFFFFFFFFFFFull;
    } else {
        /* Denormalized number */
        const Yep32u pointOffset = yepBuiltin_Nlz_64u_32u(xWord) - 11u;
        exponent = -1022 - Yep32s(pointOffset);
        mantissa = (xWord << pointOffset) & 0x000FFFFFFFFFFFFFull;
    }
    x = yepBuiltin_Cast_64u_64f(defaultExponent | mantissa);
    if (x >= sqrt2) {
        exponent += 1;
        x = x * 0.5;
    }
    const Yep64f t = x - 1.0;
    const Yep64f dexp = yepBuiltin_Convert_32s_64f(exponent);
    const Yep64f pt = yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
        yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
        yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
        yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
        yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
        yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
        yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
        yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
        yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
        yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
        yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
        yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
        yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
        yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
        yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
        yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
        yepBuiltin_MultiplyAdd_64f64f64f_64f(t,
        yepBuiltin_MultiplyAdd_64f64f64f_64f(t, c20, c19),
        c18),
        c17),
        c16),
        c15),
        c14),
        c13),
        c12),
        c11),
        c10),
        c9),
        c8),
        c7),
        c6),
        c5),
        c4),
        c3),
        c2);
    const Yep64f rf = yepBuiltin_MultiplyAdd_64f64f64f_64f(t, t * pt, t);
    Yep64f f = yepBuiltin_MultiplyAdd_64f64f64f_64f(dexp, ln2.high, yepBuiltin_MultiplyAdd_64f64f64f_64f(dexp, ln2.low, rf));
    if YEP_UNLIKELY(xWord < 0ll) {
        /* Fixup negative inputs */
        f = yepBuiltin_NaN_64f();
    } else if YEP_UNLIKELY(xWord == 0ll) {
        /* Fixup +0.0 */
        f = yepBuiltin_NegativeInfinity_64f();
    } else if YEP_UNLIKELY(!(x < yepBuiltin_PositiveInfinity_64f())) {
        /* Fixup +inf and NaN */
        f = x;
    }
    return f;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_Exp_32f_32f(Yep32f x) {
#if defined(YEP_COMPILER_SUPPORTS_HEXADECIMAL_FLOATING_POINT_CONSTANTS)
    const Yep32f magicBias = 0x1.000000p+23f;
    const Yep32f zeroCutoff = -0x1.9FE368p+6f; /* The smallest x for which expf(x) is non-zero */
    const Yep32f infCutoff = 0x1.62E42Ep+6f; /* The largest x for which expf(x) is finite */
    const Yep32f log2e = 0x1.715476p+0f;
    const Yep32f ln2_hi = 0x1.62E400p-1f; /* The lowest 7 bits are zeros */
    const Yep32f ln2_lo = 0x1.7F7D1Cp-20f;

    const Yep32f c2 = 0x1.FFFFFCp-2f;
    const Yep32f c3 = 0x1.55548Cp-3f;
    const Yep32f c4 = 0x1.555834p-5f;
    const Yep32f c5 = 0x1.123CFEp-7f;
    const Yep32f c6 = 0x1.6ADCAEp-10f;
#else
    const Yep32f magicBias = 8388608.0f;
    const Yep32f zeroCutoff = -1.03972076416015625e+2f; /* The smallest x for which expf(x) is non-zero */
    const Yep32f infCutoff = 8.872283172607421875e+1f; /* The largest x for which expf(x) is finite */
    const Yep32f log2e = 1.44269502162933349609375f;
    const Yep32f ln2_hi = 0.693145751953125f; /* The lowest 7 bits are zeros */
    const Yep32f ln2_lo = 1.428606765330187045037746429443359375e-6f;

    const Yep32f c2 = 0.499999940395355224609375f;
    const Yep32f c3 = 0.1666651666164398193359375f;
    const Yep32f c4 = 4.1668035089969635009765625e-2f;
    const Yep32f c5 = 8.369087241590023040771484375e-3f;
    const Yep32f c6 = 1.384208793751895427703857421875e-3f;
#endif

    if YEP_UNLIKELY(yepBuiltin_IsNaN_32f(x)) {
        return x;
    } else {
        Yep32f t = x * log2e + magicBias;
        Yep32u e1 = yepBuiltin_Cast_32f_32u(t) << 23;
        Yep32u e2 = e1;
        e1 = yepBuiltin_Clamp_32s32s32s_32s(e1, -126 << 23, 127 << 23);
        e2 -= e1;
        const Yep32f s1 = yepBuiltin_Cast_32u_32f(e1 + 0x3F800000u);
        const Yep32f s2 = yepBuiltin_Cast_32u_32f(e2 + 0x3F800000u);
        t -= magicBias;
        const Yep32f rx = (x - t * ln2_hi) - t * ln2_lo;
        const Yep32f rf = rx + rx * rx * (c2 + rx * (c3 + rx * (c4 + rx * (c5 + rx * c6))));
        Yep32f f = s2 * (s1 * rf + s1);
        if YEP_UNLIKELY(x > infCutoff) {
            f = yepBuiltin_PositiveInfinity_32f();
        }
        if YEP_UNLIKELY(x < zeroCutoff) {
            f = 0.0f;
        }
        return f;
    }
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Exp_64f_64f(Yep64f x) {
#if defined(YEP_COMPILER_SUPPORTS_HEXADECIMAL_FLOATING_POINT_CONSTANTS)
    const Yep64f magicBias = 0x1.8000000000000p+52;
    const Yep64f log2e = 0x1.71547652B82FEp+0;
    #if defined(YEP_PROCESSOR_SUPPORTS_FMA_EXTENSION)
        const Yep64df ln2 = { 0x1.62E42FEFA39EFp-1, 0x1.ABC9E3B39803Fp-56 };
    #else
        const Yep64df ln2 = { 0x1.62E42FEFA3800p-1, 0x1.EF35793C76730p-45 };
    #endif
    const Yep64f c2 = 0x1.0000000000005p-1;
    const Yep64f c3 = 0x1.5555555555540p-3;
    const Yep64f c4 = 0x1.5555555552115p-5;
    const Yep64f c5 = 0x1.11111111173CAp-7;
    const Yep64f c6 = 0x1.6C16C17F2BF99p-10;
    const Yep64f c7 = 0x1.A01A017EEB164p-13;
    const Yep64f c8 = 0x1.A019A6AC02A7Dp-16;
    const Yep64f c9 = 0x1.71DE71651CE7Ap-19;
    const Yep64f c10 = 0x1.28A284098D813p-22;
    const Yep64f c11 = 0x1.AE9043CA87A40p-26;

    const Yep64f zeroCutoff = -0x1.74910D52D3051p+9;
    const Yep64f infCutoff = 0x1.62E42FEFA39EFp+9;
#else
    const Yep64f magicBias = 6755399441055744.0;
    const Yep64f log2e = 1.4426950408889634;
    #if defined(YEP_PROCESSOR_SUPPORTS_FMA_EXTENSION)
        const Yep64df ln2 = { 0.6931471805599453, 2.3190468138462996e-17 };
    #else
        const Yep64df ln2 = { 0.6931471805598903, 5.497923018708371e-14 };
    #endif
    const Yep64f c2 = 0.5000000000000006;
    const Yep64f c3 = 0.16666666666666607;
    const Yep64f c4 = 0.04166666666657385;
    const Yep64f c5 = 0.008333333333377175;
    const Yep64f c6 = 0.0013888888932278352;
    const Yep64f c7 = 0.0001984126974695729;
    const Yep64f c8 = 2.4801504579877947e-5;
    const Yep64f c9 = 2.755738182142102e-6;
    const Yep64f c10 = 2.762627110160372e-7;
    const Yep64f c11 = 2.5062096212675488e-8;

    const Yep64f zeroCutoff = -745.1332191019411;
    const Yep64f infCutoff = 709.7827128933840;
#endif

    if YEP_UNLIKELY(yepBuiltin_IsNaN_64f(x)) {
        return x;
    } else {
        Yep64f t = x * log2e + magicBias;
        Yep32u e1 = yepBuiltin_GetLowPart_64u_32u(yepBuiltin_Cast_64f_64u(t)) << 20;
        Yep32u e2 = e1;
        e1 = yepBuiltin_Clamp_32s32s32s_32s(e1, -1022 << 20, 1023 << 20);
        e2 -= e1;
        const Yep64f s1 = yepBuiltin_Cast_64u_64f(yepBuiltin_CombineParts_32u32u_64u(e1 + 0x3FF00000u, 0u));
        const Yep64f s2 = yepBuiltin_Cast_64u_64f(yepBuiltin_CombineParts_32u32u_64u(e2 + 0x3FF00000u, 0u));
        t -= magicBias;
        const Yep64f rx = (x - t * ln2.high) - t * ln2.low;
        const Yep64f px = yepBuiltin_MultiplyAdd_64f64f64f_64f(rx,
            yepBuiltin_MultiplyAdd_64f64f64f_64f(rx,
            yepBuiltin_MultiplyAdd_64f64f64f_64f(rx,
            yepBuiltin_MultiplyAdd_64f64f64f_64f(rx,
            yepBuiltin_MultiplyAdd_64f64f64f_64f(rx,
            yepBuiltin_MultiplyAdd_64f64f64f_64f(rx,
            yepBuiltin_MultiplyAdd_64f64f64f_64f(rx,
            yepBuiltin_MultiplyAdd_64f64f64f_64f(rx,
            yepBuiltin_MultiplyAdd_64f64f64f_64f(rx, c11, c10),
            c9),
            c8),
            c7),
            c6),
            c5),
            c4),
            c3),
            c2);
        const Yep64f rf = yepBuiltin_MultiplyAdd_64f64f64f_64f(rx, rx * px, rx);
        Yep64f f = s2 * yepBuiltin_MultiplyAdd_64f64f64f_64f(s1, rf, s1);
        if YEP_UNLIKELY(x > infCutoff) {
            f = yepBuiltin_PositiveInfinity_64f();
        }
        if YEP_UNLIKELY(x < zeroCutoff) {
            f = 0.0;
        }
        return f;
    }
}

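/* Editor's note (not part of the original Yeppp! sources): both Exp kernels rely
 * on the "magic bias" rounding trick: adding 0x1.8p+52 = 1.5 * 2^52 to x * log2(e)
 * moves the value into a binade where the unit in the last place is exactly 1, so
 * the nearest integer n lands in the low bits of the double and t - magicBias
 * recovers round(x * log2(e)) as a double. The factor 2^n is then applied through
 * two scale factors s1 and s2, with s1 clamped to the normal-exponent range so
 * that results near underflow and overflow are still scaled correctly, while the
 * polynomial only ever sees the reduced argument rx = x - n * ln(2) in
 * [-ln(2)/2, ln(2)/2]. */
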
YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Sin_64f_64f(Yep64f x) {
#if defined(YEP_COMPILER_SUPPORTS_HEXADECIMAL_FLOATING_POINT_CONSTANTS)
    const Yep64f minusPio2_hi = -0x1.921FB54440000p+0;
    const Yep64f minusPio2_me = -0x1.68C234C4C8000p-39;
    const Yep64f minusPio2_lo = 0x1.9D747F23E32EDp-79;
    const Yep64f twoOPi = 0x1.45F306DC9C883p-1;
    const Yep64f magicBias = 0x1.8000000000000p+52;

    const Yep64f c0 = 0x1.0000000000000p+0;
    const Yep64f c2 = -0x1.0000000000000p-1;
    const Yep64f c3 = -0x1.5555555555546p-3;
    const Yep64f c4 = 0x1.555555555554Bp-5;
    const Yep64f c5 = 0x1.111111110F51Ep-7;
    const Yep64f c6 = -0x1.6C16C16C15038p-10;
    const Yep64f c7 = -0x1.A01A019BB92C0p-13;
    const Yep64f c8 = 0x1.A01A019C94874p-16;
    const Yep64f c9 = 0x1.71DE3535C8A8Ap-19;
    const Yep64f c10 = -0x1.27E4F7F65104Fp-22;
    const Yep64f c11 = -0x1.AE5E38936D046p-26;
    const Yep64f c12 = 0x1.1EE9DF6693F7Ep-29;
    const Yep64f c13 = 0x1.5D8711D281543p-33;
    const Yep64f c14 = -0x1.8FA87EF79AE3Fp-37;
#else
    const Yep64f minusPio2_hi = -1.5707963267923333;
    const Yep64f minusPio2_me = -2.5633441515971907e-12;
    const Yep64f minusPio2_lo = 2.6718907338610155e-24;
    const Yep64f twoOPi = 0.6366197723675814;
    const Yep64f magicBias = 6755399441055744.0;

    const Yep64f c0 = 1.0;
    const Yep64f c2 = -0.5;
    const Yep64f c3 = -0.16666666666666624;
    const Yep64f c4 = 0.041666666666666595;
    const Yep64f c5 = 0.008333333333320921;
    const Yep64f c6 = -0.0013888888888873418;
    const Yep64f c7 = -0.0001984126982882608;
    const Yep64f c8 = 2.480158728907678e-05;
    const Yep64f c9 = 2.755731339913502e-06;
    const Yep64f c10 = -2.755731424340092e-07;
    const Yep64f c11 = -2.505071756776031e-08;
    const Yep64f c12 = 2.0875709384133097e-09;
    const Yep64f c13 = 1.58946757299646e-10;
    const Yep64f c14 = -1.135896887279365e-11;
#endif
    Yep64f t = x * twoOPi + magicBias;
    const Yep32u n = yepBuiltin_GetLowPart_64u_32u(yepBuiltin_Cast_64f_64u(t));
    t -= magicBias;
    x += t * minusPio2_hi;
    const Yep64f a = x;
    const Yep64f midProduct = t * minusPio2_me;
    x += midProduct;
    const Yep64f r = midProduct - (x - a);
    x += (t * minusPio2_lo + r);

    const Yep64f sqrX = x * x;
    Yep64f sinX = c13;
    sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c11);
    sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c9);
    sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c7);
    sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c5);
    sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c3);
    sinX = sinX * sqrX;
    sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, x, x);
    Yep64f cosX = c14;
    cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c12);
    cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c10);
    cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c8);
    cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c6);
    cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c4);
    cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c2);
    cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c0);

    const Yep64f f = (n & 1) ? cosX : sinX;
    return (n & 2) ? -f : f;
}

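/* Editor's note (not part of the original Yeppp! sources): the sine kernel uses a
 * three-part Cody-Waite reduction (pi/2 split into hi/me/lo terms) so that
 * x - n * pi/2 stays accurate even for large arguments, with n obtained via the
 * same magic-bias rounding trick as in Exp. The quadrant identity
 * sin(x + n*pi/2) = +-sin(x) for even n and +-cos(x) for odd n is what the final
 * (n & 1) / (n & 2) selection implements; Cos and Tan below reuse the same
 * reduction and polynomial kernels. */
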
YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Cos_64f_64f(Yep64f x) {
#if defined(YEP_COMPILER_SUPPORTS_HEXADECIMAL_FLOATING_POINT_CONSTANTS)
    const Yep64f minusPio2_hi = -0x1.921FB54440000p+0;
    const Yep64f minusPio2_me = -0x1.68C234C4C8000p-39;
    const Yep64f minusPio2_lo = 0x1.9D747F23E32EDp-79;
    const Yep64f twoOPi = 0x1.45F306DC9C883p-1;
    const Yep64f magicBias = 0x1.8000000000000p+52;

    const Yep64f c0 = -0x1.0000000000000p+0;
    const Yep64f c2 = 0x1.0000000000000p-1;
    const Yep64f c3 = -0x1.5555555555546p-3;
    const Yep64f c4 = -0x1.555555555554Bp-5;
    const Yep64f c5 = 0x1.111111110F51Ep-7;
    const Yep64f c6 = 0x1.6C16C16C15038p-10;
    const Yep64f c7 = -0x1.A01A019BB92C0p-13;
    const Yep64f c8 = -0x1.A01A019C94874p-16;
    const Yep64f c9 = 0x1.71DE3535C8A8Ap-19;
    const Yep64f c10 = 0x1.27E4F7F65104Fp-22;
    const Yep64f c11 = -0x1.AE5E38936D046p-26;
    const Yep64f c12 = -0x1.1EE9DF6693F7Ep-29;
    const Yep64f c13 = 0x1.5D8711D281543p-33;
    const Yep64f c14 = 0x1.8FA87EF79AE3Fp-37;
#else
    const Yep64f minusPio2_hi = -1.5707963267923333;
    const Yep64f minusPio2_me = -2.5633441515971907e-12;
    const Yep64f minusPio2_lo = 2.6718907338610155e-24;
    const Yep64f twoOPi = 0.6366197723675814;
    const Yep64f magicBias = 6755399441055744.0;

    const Yep64f c0 = -1.0;
    const Yep64f c2 = 0.5;
    const Yep64f c3 = -0.16666666666666624;
    const Yep64f c4 = -0.041666666666666595;
    const Yep64f c5 = 0.008333333333320921;
    const Yep64f c6 = 0.0013888888888873418;
    const Yep64f c7 = -0.0001984126982882608;
    const Yep64f c8 = -2.480158728907678e-05;
    const Yep64f c9 = 2.755731339913502e-06;
    const Yep64f c10 = 2.755731424340092e-07;
    const Yep64f c11 = -2.505071756776031e-08;
    const Yep64f c12 = -2.0875709384133097e-09;
    const Yep64f c13 = 1.58946757299646e-10;
    const Yep64f c14 = 1.135896887279365e-11;
#endif
    Yep64f t = x * twoOPi + magicBias;
    const Yep32u n = yepBuiltin_GetLowPart_64u_32u(yepBuiltin_Cast_64f_64u(t));
    t -= magicBias;
    x += t * minusPio2_hi;
    const Yep64f a = x;
    const Yep64f midProduct = t * minusPio2_me;
    x += midProduct;
    const Yep64f r = midProduct - (x - a);
    x += (t * minusPio2_lo + r);

    const Yep64f sqrX = x * x;
    Yep64f sinX = c13;
    sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c11);
    sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c9);
    sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c7);
    sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c5);
    sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c3);
    sinX = sinX * sqrX;
    sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, x, x);
    Yep64f minusCosX = c14;
    minusCosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(minusCosX, sqrX, c12);
    minusCosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(minusCosX, sqrX, c10);
    minusCosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(minusCosX, sqrX, c8);
    minusCosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(minusCosX, sqrX, c6);
    minusCosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(minusCosX, sqrX, c4);
    minusCosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(minusCosX, sqrX, c2);
    minusCosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(minusCosX, sqrX, c0);

    const Yep64f f = (n & 1) ? sinX : minusCosX;
    return (n & 2) ? f : -f;
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Tan_64f_64f(Yep64f x) {
#if defined(YEP_COMPILER_SUPPORTS_HEXADECIMAL_FLOATING_POINT_CONSTANTS)
    const Yep64f minusPio2_hi = -0x1.921FB54440000p+0;
    const Yep64f minusPio2_me = -0x1.68C234C4C8000p-39;
    const Yep64f minusPio2_lo = 0x1.9D747F23E32EDp-79;
    const Yep64f twoOPi = 0x1.45F306DC9C883p-1;
    const Yep64f magicBias = 0x1.8000000000000p+52;

    const Yep64f c0 = 0x1.0000000000000p+0;
    const Yep64f c2 = -0x1.0000000000000p-1;
    const Yep64f c3 = -0x1.5555555555546p-3;
    const Yep64f c4 = 0x1.555555555554Bp-5;
    const Yep64f c5 = 0x1.111111110F51Ep-7;
    const Yep64f c6 = -0x1.6C16C16C15038p-10;
    const Yep64f c7 = -0x1.A01A019BB92C0p-13;
    const Yep64f c8 = 0x1.A01A019C94874p-16;
    const Yep64f c9 = 0x1.71DE3535C8A8Ap-19;
    const Yep64f c10 = -0x1.27E4F7F65104Fp-22;
    const Yep64f c11 = -0x1.AE5E38936D046p-26;
    const Yep64f c12 = 0x1.1EE9DF6693F7Ep-29;
    const Yep64f c13 = 0x1.5D8711D281543p-33;
    const Yep64f c14 = -0x1.8FA87EF79AE3Fp-37;
#else
    const Yep64f minusPio2_hi = -1.5707963267923333;
    const Yep64f minusPio2_me = -2.5633441515971907e-12;
    const Yep64f minusPio2_lo = 2.6718907338610155e-24;
    const Yep64f twoOPi = 0.6366197723675814;
    const Yep64f magicBias = 6755399441055744.0;

    const Yep64f c0 = 1.0;
    const Yep64f c2 = -0.5;
    const Yep64f c3 = -0.16666666666666624;
    const Yep64f c4 = 0.041666666666666595;
    const Yep64f c5 = 0.008333333333320921;
    const Yep64f c6 = -0.0013888888888873418;
    const Yep64f c7 = -0.0001984126982882608;
    const Yep64f c8 = 2.480158728907678e-05;
    const Yep64f c9 = 2.755731339913502e-06;
    const Yep64f c10 = -2.755731424340092e-07;
    const Yep64f c11 = -2.505071756776031e-08;
    const Yep64f c12 = 2.0875709384133097e-09;
    const Yep64f c13 = 1.58946757299646e-10;
    const Yep64f c14 = -1.135896887279365e-11;
#endif

    Yep64f t = x * twoOPi + magicBias;
    const Yep32u n = yepBuiltin_GetLowPart_64u_32u(yepBuiltin_Cast_64f_64u(t));
    t -= magicBias;
    x += t * minusPio2_hi;
    const Yep64f a = x;
    const Yep64f midProduct = t * minusPio2_me;
    x += midProduct;
    const Yep64f r = midProduct - (x - a);
    x += (t * minusPio2_lo + r);

    const Yep64f sqrX = x * x;
    Yep64f sinX = c13;
    sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c11);
    sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c9);
    sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c7);
    sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c5);
    sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, sqrX, c3);
    sinX = sinX * sqrX;
    sinX = yepBuiltin_MultiplyAdd_64f64f64f_64f(sinX, x, x);
    Yep64f cosX = c14;
    cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c12);
    cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c10);
    cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c8);
    cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c6);
    cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c4);
    cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c2);
    cosX = yepBuiltin_MultiplyAdd_64f64f64f_64f(cosX, sqrX, c0);

    return (n & 1) ? (-cosX / sinX) : (sinX / cosX);
}

#if defined(YEP_MSVC_COMPATIBLE_COMPILER)
    #pragma intrinsic(sqrt)
#endif

YEP_NATIVE_FUNCTION static YEP_INLINE Yep32f yepBuiltin_Sqrt_32f_32f(Yep32f x) {
#if defined(YEP_MSVC_COMPATIBLE_COMPILER)
    return sqrt(x);
#elif defined(YEP_NVIDIA_COMPILER)
    return __fsqrt_rn(x);
#elif defined(YEP_GCC_COMPATIBLE_COMPILER)
    return __builtin_sqrtf(x);
#elif defined(YEP_ARM_COMPILER)
    return __sqrtf(x);
#else
    #error "Compiler-specific implementation needed"
#endif
}

YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_Sqrt_64f_64f(Yep64f x) {
#if defined(YEP_MSVC_COMPATIBLE_COMPILER)
    return sqrt(x);
#elif defined(YEP_NVIDIA_COMPILER)
    return __dsqrt_rn(x);
#elif defined(YEP_GCC_COMPATIBLE_COMPILER)
    return __builtin_sqrt(x);
#elif defined(YEP_ARM_COMPILER)
    return __sqrt(x);
#else
    #error "Compiler-specific implementation needed"
#endif
}

1461 YEP_NATIVE_FUNCTION static YEP_INLINE Yep64f yepBuiltin_ArcSin_64f_64f(Yep64f x) {
1462 #if defined(YEP_COMPILER_SUPPORTS_HEXADECIMAL_FLOATING_POINT_CONSTANTS)
1463  const Yep64f half = 0x1.0000000000000p-1;
1464  const Yep64f ac3 = 0x1.5555555555332p-3;
1465  const Yep64f ac5 = 0x1.33333333768C7p-4;
1466  const Yep64f ac7 = 0x1.6DB6DB3E4DA8Ap-5;
1467  const Yep64f ac9 = 0x1.F1C72D5B739EFp-6;
1468  const Yep64f ac11 = 0x1.6E89DC94F7B19p-6;
1469  const Yep64f ac13 = 0x1.1C6D1EE2BF355p-6;
1470  const Yep64f ac15 = 0x1.C6E7A6CA04E0Dp-7;
1471  const Yep64f ac17 = 0x1.8F47A67BD13CFp-7;
1472  const Yep64f ac19 = 0x1.A7AC3B4A38FB8p-8;
1473  const Yep64f ac21 = 0x1.4296C857308B2p-6;
1474  const Yep64f ac23 = -0x1.0DB1C05152E38p-6;
1475  const Yep64f ac25 = 0x1.06AD1B749C8D4p-5;
1476 
1477  const Yep64f bc0 = 0x1.921FB54442D18p+0;
1478  const Yep64f bc1 = 0x1.6A09E667F3BC7p+1;
1479  const Yep64f bc3 = -0x1.E2B7DDDFF06ACp-2;
1480  const Yep64f bc5 = 0x1.B27247B01E1B8p-3;
1481  const Yep64f bc7 = -0x1.02995B468EBC5p-3;
1482  const Yep64f bc9 = 0x1.5FFB7742ECDC6p-4;
1483  const Yep64f bc11 = -0x1.032E1D4CDEC75p-4;
1484  const Yep64f bc13 = 0x1.924AF9192AF6Ap-5;
1485  const Yep64f bc15 = -0x1.41264A779EBFFp-5;
1486  const Yep64f bc17 = 0x1.1D9B9AF0438A1p-5;
1487  const Yep64f bc19 = -0x1.106A0643EEB6Cp-6;
1488  const Yep64f bc21 = 0x1.EBCC69FBEBEC2p-5;
1489  const Yep64f bc23 = 0x1.B2DE37FA33AAAp-5;
1490  const Yep64f bc25 = 0x1.8509940B63DD2p-4;
1491 #else
1492  const Yep64f half = 0.5;
1493  const Yep64f ac3 = 0.16666666666665148;
1494  const Yep64f ac5 = 0.07500000000382832;
1495  const Yep64f ac7 = 0.044642856797897215;
1496  const Yep64f ac9 = 0.03038196019570621;
1497  const Yep64f ac11 = 0.02237173596574413;
1498  const Yep64f ac13 = 0.017360000764699752;
1499  const Yep64f ac15 = 0.013882595481880445;
1500  const Yep64f ac17 = 0.01218505505642922;
1501  const Yep64f ac19 = 0.006464733576851893;
1502  const Yep64f ac21 = 0.019689269681074158;
1503  const Yep64f ac23 = -0.016460836229539505;
1504  const Yep64f ac25 = 0.03206496584324872;
1505 
1506  const Yep64f bc0 = 1.5707963267948966;
1507  const Yep64f bc1 = 2.8284271247461876;
1508  const Yep64f bc3 = -0.4714045207912061;
1509  const Yep64f bc5 = 0.21213203436105998;
1510  const Yep64f bc7 = -0.12626906689714992;
1511  const Yep64f bc9 = 0.08593317591185387;
1512  const Yep64f bc11 = -0.0632764000455824;
1513  const Yep64f bc13 = 0.04910801555646922;
1514  const Yep64f bc15 = -0.03920282883060366;
1515  const Yep64f bc17 = 0.034864237417523876;
1516  const Yep64f bc19 = -0.01662684070445712;
1517  const Yep64f bc21 = 0.060033995628484785;
1518  const Yep64f bc23 = 0.053084477740062155;
1519  const Yep64f bc25 = 0.0949798377025595;
1520 #endif
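/* Two-interval scheme: on [0, 1/2] arcsin is approximated directly by an odd
 polynomial in x (the ac* coefficients); on (1/2, 1] it is reduced through the
 identity arcsin(x) = pi/2 - 2*arcsin(sqrt((1 - x) / 2)), with the bc* polynomial
 absorbing the scaling (note bc0 = pi/2 and bc1 = 2*sqrt(2)). */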
1521  const Yep64f absX = yepBuiltin_Abs_64f_64f(x);
1522  if YEP_LIKELY(absX <= 1.0) {
1523  if (absX <= half) {
1524  const Yep64f ax = x;
1525  const Yep64f ax2 = ax * ax;
1526  Yep64f af = ac25;
1527  af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac23);
1528  af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac21);
1529  af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac19);
1530  af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac17);
1531  af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac15);
1532  af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac13);
1533  af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac11);
1534  af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac9);
1535  af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac7);
1536  af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac5);
1537  af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af, ax2, ac3);
1538  af = yepBuiltin_MultiplyAdd_64f64f64f_64f(af * ax2, ax, ax);
1539  return af;
1540  } else {
1541  const Yep64f bx2 = absX * half - half; /* (|x| - 1) / 2, in (-1/4, 0] */
1542  const Yep64f bx = -yepBuiltin_Sqrt_64f_64f(bx2 * (-half)); /* -sqrt((1 - |x|) / 4); the square root must be taken of the non-negative value (1 - |x|) / 4 */
1543  Yep64f bf = bc25;
1544  bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc23);
1545  bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc21);
1546  bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc19);
1547  bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc17);
1548  bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc15);
1549  bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc13);
1550  bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc11);
1551  bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc9);
1552  bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc7);
1553  bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc5);
1554  bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc3);
1555  bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx2, bc1);
1556  bf = yepBuiltin_MultiplyAdd_64f64f64f_64f(bf, bx, bc0);
1557  return x > 0.0 ? bf : -bf;
1558  }
1559  } else {
1560  if (yepBuiltin_IsNaN_64f(absX)) {
1561  return x;
1562  } else {
1563  return yepBuiltin_NaN_64f();
1564  }
1565  }
1566 }
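/*
 * Illustrative usage (not part of the library):
 *
 *   yepBuiltin_ArcSin_64f_64f(0.6);  // ~0.6435011087932844
 *   yepBuiltin_ArcSin_64f_64f(-1.0); // -pi/2
 *   yepBuiltin_ArcSin_64f_64f(1.5);  // NaN: input outside [-1, 1]
 */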
1567 
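/* Widening 32x32 -> 64 multiplies: on 32-bit MSVC targets a plain 64-bit
 multiplication may be lowered to a runtime helper call, while __emulu/__emul
 compile to a single MUL/IMUL instruction. */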
1568 YEP_NATIVE_FUNCTION static YEP_INLINE Yep64u yepBuiltin_Multiply_32u32u_64u(Yep32u x, Yep32u y) {
1569 #if defined(YEP_MICROSOFT_COMPILER) && defined(YEP_X86_CPU)
1570  return __emulu(x, y);
1571 #else
1572  return Yep64u(x) * Yep64u(y);
1573 #endif
1574 }
1575 
1576 YEP_NATIVE_FUNCTION static YEP_INLINE Yep64s yepBuiltin_Multiply_32s32s_64s(Yep32s x, Yep32s y) {
1577 #if defined(YEP_MICROSOFT_COMPILER) && defined(YEP_X86_CPU)
1578  return __emul(x, y);
1579 #else
1580  return Yep64s(x) * Yep64s(y);
1581 #endif
1582 }
1583 
1584 YEP_NATIVE_FUNCTION static YEP_INLINE Yep32u yepBuiltin_MultiplyHigh_32u32u_32u(Yep32u x, Yep32u y) {
1585 #if defined(YEP_MICROSOFT_COMPILER) && defined(YEP_X86_CPU)
1586  return Yep32u(__emulu(x, y) >> 32);
1587 #elif defined(YEP_NVIDIA_COMPILER)
1588  return __umulhi(x, y);
1589 #else
1590  return Yep32u(Yep64u(x) * Yep64u(y) >> 32);
1591 #endif
1592 }
1593 
1594 YEP_NATIVE_FUNCTION static YEP_INLINE Yep32s yepBuiltin_MultiplyHigh_32s32s_32s(Yep32s x, Yep32s y) {
1595 #if defined(YEP_MICROSOFT_COMPILER) && defined(YEP_X86_CPU)
1596  return Yep32s(Yep32u(Yep64u(__emul(x, y)) >> 32));
1597 #elif defined(YEP_NVIDIA_COMPILER)
1598  return __mulhi(x, y);
1599 #else
1600  return Yep32s(Yep32u(Yep64u(Yep64s(x) * Yep64s(y)) >> 32));
1601 #endif
1602 }
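/*
 * Illustrative use (not part of the library): the high half of the full product
 * is the standard primitive for Q0.32 fixed-point multiplication, where a
 * fraction a in [0, 1) is stored as round(a * 2^32):
 *
 *   const Yep32u c = yepBuiltin_MultiplyHigh_32u32u_32u(a, b); // c/2^32 ~= (a/2^32)*(b/2^32)
 */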
1603 
1604 #if defined(YEP_MSVC_COMPATIBLE_COMPILER) && (defined(YEP_X64_ABI) || defined(YEP_IA64_ABI))
1605 YEP_NATIVE_FUNCTION static YEP_INLINE Yep128u yepBuiltin_Multiply_64u64u_128u(Yep64u x, Yep64u y) {
1606  Yep128u result;
1607  result.low = _umul128(x, y, &result.high);
1608  return result;
1609 }
1610 #elif defined(YEP_GCC_COMPATIBLE_COMPILER) && defined(YEP_X64_ABI)
1611 YEP_NATIVE_FUNCTION static YEP_INLINE Yep128u yepBuiltin_Multiply_64u64u_128u(Yep64u x, Yep64u y) {
1612  const __uint128_t product = ((__uint128_t)x) * ((__uint128_t)y);
1613  Yep128u result;
1614  result.low = Yep64u(product);
1615  result.high = Yep64u(product >> 64);
1616  return result;
1617 }
1618 #elif defined(YEP_NVIDIA_COMPILER) && defined(YEP_CUDA_GPU)
1619 YEP_NATIVE_FUNCTION static YEP_INLINE Yep128u yepBuiltin_Multiply_64u64u_128u(Yep64u x, Yep64u y) {
1620  Yep128u result;
1621  result.low = x * y;
1622  result.high = __umul64hi(x, y);
1623  return result;
1624 }
1625 #endif
1626 
1627 #if defined(YEP_MSVC_COMPATIBLE_COMPILER) && (defined(YEP_X64_ABI) || defined(YEP_IA64_ABI))
1628 YEP_NATIVE_FUNCTION static YEP_INLINE Yep128s yepBuiltin_Multiply_64s64s_128s(Yep64s x, Yep64s y) {
1629  Yep128s result;
1630  __int64 highPart;
1631  result.low = _mul128(x, y, &highPart);
1632  result.high = highPart;
1633  return result;
1634 }
1635 #elif defined(YEP_GCC_COMPATIBLE_COMPILER) && defined(YEP_X64_ABI)
1636 YEP_NATIVE_FUNCTION static YEP_INLINE Yep128s yepBuiltin_Multiply_64s64s_128s(Yep64s x, Yep64s y) {
1637  const __int128_t product = ((__int128_t)x) * ((__int128_t)y);
1638  Yep128s result;
1639  result.low = Yep64u(product);
1640  result.high = Yep64u(((__uint128_t)product) >> 64);
1641  return result;
1642 }
1643 #elif defined(YEP_NVIDIA_COMPILER) && defined(YEP_CUDA_GPU)
1644 YEP_NATIVE_FUNCTION static YEP_INLINE Yep128s yepBuiltin_Multiply_64s64s_128s(Yep64s x, Yep64s y) {
1645  Yep128s result;
1646  result.low = x * y;
1647  result.high = __mul64hi(x, y);
1648  return result;
1649 }
1650 #endif
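/* The 64x64 -> 128 widening multiplies are defined only where the toolchain
 offers a suitable intrinsic (_umul128/_mul128, __umul64hi/__mul64hi) or a
 native __uint128_t; code that calls them is expected to use the same
 preprocessor guards. */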
1651 
1652 /* Emulation of __cpuid, __cpuidex, and _xgetbv intrinsics on x86 and x86-64 */
1653 #if defined(YEP_X86_CPU)
1654  #if defined(YEP_GCC_COMPATIBLE_COMPILER)
1655  #if defined(YEP_X86_ABI) && defined(YEP_PIC)
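/* In position-independent x86 code ebx holds the GOT pointer and may not be
 named as an asm output, so cpuid's ebx result is moved through edi instead. */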
1656  static YEP_INLINE void __cpuid(int CPUInfo[4], int InfoType) {
1657  CPUInfo[0] = InfoType;
1658  asm volatile (
1659  "movl %%ebx, %%edi;"
1660  "cpuid;"
1661  "xchgl %%ebx, %%edi;"
1662  :"+a" (CPUInfo[0]), "=D" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3])
1663  :
1664  :
1665  );
1666  }
1667 
1668  static YEP_INLINE void __cpuidex(int CPUInfo[4], int InfoType, int ECXValue) {
1669  CPUInfo[0] = InfoType;
1670  CPUInfo[2] = ECXValue;
1671  asm volatile (
1672  "movl %%ebx, %%edi;"
1673  "cpuid;"
1674  "xchgl %%ebx, %%edi;"
1675  :"+a" (CPUInfo[0]), "=D" (CPUInfo[1]), "+c" (CPUInfo[2]), "=d" (CPUInfo[3])
1676  :
1677  :
1678  );
1679  }
1680  #else
1681  static YEP_INLINE void __cpuid(int CPUInfo[4], int InfoType) {
1682  CPUInfo[0] = InfoType;
1683  asm volatile (
1684  "cpuid;"
1685  :"+a" (CPUInfo[0]), "=b" (CPUInfo[1]), "=c" (CPUInfo[2]), "=d" (CPUInfo[3])
1686  :
1687  :
1688  );
1689  }
1690 
1691  static YEP_INLINE void __cpuidex(int CPUInfo[4], int InfoType, int ECXValue) {
1692  CPUInfo[0] = InfoType;
1693  CPUInfo[2] = ECXValue;
1694  asm volatile (
1695  "cpuid;"
1696  :"+a" (CPUInfo[0]), "=b" (CPUInfo[1]), "+c" (CPUInfo[2]), "=d" (CPUInfo[3])
1697  :
1698  :
1699  );
1700  }
1701  #endif
1702 
1703  #if !defined(YEP_INTEL_COMPILER) && !defined(YEP_K1OM_X64_ABI)
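/* The Intel compiler provides its own _xgetbv, and the first-generation
 Xeon Phi (K1OM) lacks the XGETBV instruction. XGETBV with ECX = 0 reads
 XCR0, which reports which register states (x87/SSE/AVX) the OS has enabled
 for context switching. */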
1704  static YEP_INLINE Yep64u _xgetbv(Yep32u ext_ctrl_reg) {
1705  Yep32u lo, hi;
1706  asm volatile (
1707  "xgetbv"
1708  : "=a"(lo), "=d"(hi)
1709  : "c"(ext_ctrl_reg)
1710  :
1711  );
1712  return (Yep64u(hi) << 32) | Yep64u(lo);
1713  }
1714  #endif
1715  #elif defined(YEP_MSVC_COMPATIBLE_COMPILER)
1716  /* The __cpuidex intrinsic is not supported until Visual Studio 2008 SP1 */
1717  #if defined(YEP_MICROSOFT_COMPILER) && _MSC_FULL_VER < 150030729
1718  #pragma section(".text")
1719 
1720  #if defined(YEP_X86_CPU)
1721  /* fastcall: first argument in ecx, second in edx, third in [esp + 4] */
1722 
1723  __declspec(allocate(".text")) static const char __cpuidex_bytecode[] =
1724  "\x53\x56\x8B\x74\x24\x0C\x89\xD0\x0F\xA2\x89\x06\x89\x5E\x04\x89\x4E\x08\x89\x56\x0C\x5E\x5B\xC2\x04\x00";
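/* Decodes as: push ebx; push esi; mov esi, [esp+0xC]; mov eax, edx; cpuid;
 mov [esi], eax; mov [esi+4], ebx; mov [esi+8], ecx; mov [esi+0xC], edx;
 pop esi; pop ebx; ret 4 */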
1725 
1726  #else
1727  /* x64: first argument in ecx, second in edx, third in r8 */
1728 
1729  __declspec(allocate(".text")) static const char __cpuidex_bytecode[] =
1730  "\x53\x89\xD0\x0F\xA2\x41\x89\x00\x41\x89\x58\x04\x41\x89\x48\x08\x41\x89\x50\x0C\x5B\xC3";
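/* Decodes as: push rbx; mov eax, edx; cpuid; mov [r8], eax; mov [r8+4], ebx;
 mov [r8+8], ecx; mov [r8+12], edx; pop rbx; ret */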
1731 
1732  #endif
1733 
1734  typedef void(__fastcall *CpuidexPointer)(int, int, int[4]);
1735 
1736  static YEP_INLINE void __cpuidex(int CPUInfo[4], int InfoType, int ECXValue) {
1737  (CpuidexPointer(&__cpuidex_bytecode))(ECXValue, InfoType, CPUInfo);
1738  }
1739 
1740  #endif
1741  /* _xgetbv intrinsic is not supported until Visual Studio 2010 SP1 */
1742  #if defined(YEP_MICROSOFT_COMPILER) && _MSC_FULL_VER < 160040219
1743  #pragma section(".text")
1744 
1745  #if defined(YEP_X86_CPU)
1746  /* fastcall: first argument in ecx, second in edx, third in [esp + 4] */
1747 
1748  __declspec(allocate(".text")) static const char _xgetbv_bytecode[] =
1749  "\x0F\x01\xD0\xC3";
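/* Decodes as: xgetbv; ret (the 64-bit result is returned in edx:eax) */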
1750 
1751  #else
1752  /* x64: first argument in ecx, second in edx, third in r8 */
1753  __declspec(allocate(".text")) static const char _xgetbv_bytecode[] =
1754  "\x0F\x01\xD0\x48\xC1\xE2\x20\x48\x09\xD0\xC3";
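/* Decodes as: xgetbv; shl rdx, 32; or rax, rdx; ret */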
1755 
1756  #endif
1757 
1758  typedef Yep64u(__fastcall *XgetbvPointer)(Yep32u);
1760 
1761  static YEP_INLINE Yep64u _xgetbv(Yep32u ext_ctrl_reg) {
1762  return (XgetbvPointer(&_xgetbv_bytecode))(ext_ctrl_reg);
1763  }
1764  #elif !defined(YEP_INTEL_COMPILER)
1765  /* Visual Studio 2010 SP1: the _xgetbv intrinsic is supported, but not declared */
1766  extern "C" unsigned __int64 __cdecl _xgetbv(unsigned int ext_ctrl_reg);
1767  #pragma intrinsic(_xgetbv)
1768  #endif
1769  #endif
1770 #endif