done
This commit is contained in:
@ -0,0 +1,27 @@
|
||||
#ifdef _MSC_VER
|
||||
#include <Intrin.h>
|
||||
#endif
|
||||
#include <arm_neon.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
float *src = (float*)argv[argc-1];
|
||||
float32x4_t v1 = vdupq_n_f32(src[0]), v2 = vdupq_n_f32(src[1]);
|
||||
/* MAXMIN */
|
||||
int ret = (int)vgetq_lane_f32(vmaxnmq_f32(v1, v2), 0);
|
||||
ret += (int)vgetq_lane_f32(vminnmq_f32(v1, v2), 0);
|
||||
/* ROUNDING */
|
||||
ret += (int)vgetq_lane_f32(vrndq_f32(v1), 0);
|
||||
#ifdef __aarch64__
|
||||
{
|
||||
double *src2 = (double*)argv[argc-1];
|
||||
float64x2_t vd1 = vdupq_n_f64(src2[0]), vd2 = vdupq_n_f64(src2[1]);
|
||||
/* MAXMIN */
|
||||
ret += (int)vgetq_lane_f64(vmaxnmq_f64(vd1, vd2), 0);
|
||||
ret += (int)vgetq_lane_f64(vminnmq_f64(vd1, vd2), 0);
|
||||
/* ROUNDING */
|
||||
ret += (int)vgetq_lane_f64(vrndq_f64(vd1), 0);
|
||||
}
|
||||
#endif
|
||||
return ret;
|
||||
}
|
@ -0,0 +1,16 @@
|
||||
#ifdef _MSC_VER
|
||||
#include <Intrin.h>
|
||||
#endif
|
||||
#include <arm_neon.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
unsigned char *src = (unsigned char*)argv[argc-1];
|
||||
uint8x16_t v1 = vdupq_n_u8(src[0]), v2 = vdupq_n_u8(src[1]);
|
||||
uint32x4_t va = vdupq_n_u32(3);
|
||||
int ret = (int)vgetq_lane_u32(vdotq_u32(va, v1, v2), 0);
|
||||
#ifdef __aarch64__
|
||||
ret += (int)vgetq_lane_u32(vdotq_laneq_u32(va, v1, v2, 0), 0);
|
||||
#endif
|
||||
return ret;
|
||||
}
|
@ -0,0 +1,19 @@
|
||||
#ifdef _MSC_VER
|
||||
#include <Intrin.h>
|
||||
#endif
|
||||
#include <arm_neon.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
float16_t *src = (float16_t*)argv[argc-1];
|
||||
float *src2 = (float*)argv[argc-2];
|
||||
float16x8_t vhp = vdupq_n_f16(src[0]);
|
||||
float16x4_t vlhp = vdup_n_f16(src[1]);
|
||||
float32x4_t vf = vdupq_n_f32(src2[0]);
|
||||
float32x2_t vlf = vdup_n_f32(src2[1]);
|
||||
|
||||
int ret = (int)vget_lane_f32(vfmlal_low_f16(vlf, vlhp, vlhp), 0);
|
||||
ret += (int)vgetq_lane_f32(vfmlslq_high_f16(vf, vhp, vhp), 0);
|
||||
|
||||
return ret;
|
||||
}
|
@ -0,0 +1,15 @@
|
||||
#ifdef _MSC_VER
|
||||
#include <Intrin.h>
|
||||
#endif
|
||||
#include <arm_neon.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
float16_t *src = (float16_t*)argv[argc-1];
|
||||
float16x8_t vhp = vdupq_n_f16(src[0]);
|
||||
float16x4_t vlhp = vdup_n_f16(src[1]);
|
||||
|
||||
int ret = (int)vgetq_lane_f16(vabdq_f16(vhp, vhp), 0);
|
||||
ret += (int)vget_lane_f16(vabd_f16(vlhp, vlhp), 0);
|
||||
return ret;
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
* is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#ifndef __AVX__
|
||||
#error "HOST/ARCH doesn't support AVX"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
__m256 a = _mm256_add_ps(_mm256_loadu_ps((const float*)argv[argc-1]), _mm256_loadu_ps((const float*)argv[1]));
|
||||
return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
* is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#ifndef __AVX2__
|
||||
#error "HOST/ARCH doesn't support AVX2"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
__m256i a = _mm256_abs_epi16(_mm256_loadu_si256((const __m256i*)argv[argc-1]));
|
||||
return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
|
||||
}
|
@ -0,0 +1,22 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
* is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#ifndef __AVX512VNNI__
|
||||
#error "HOST/ARCH doesn't support CascadeLake AVX512 features"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
/* VNNI */
|
||||
__m512i a = _mm512_loadu_si512((const __m512i*)argv[argc-1]);
|
||||
a = _mm512_dpbusd_epi32(a, _mm512_setzero_si512(), a);
|
||||
return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
|
||||
}
|
@ -0,0 +1,24 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
* is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#if !defined(__AVX512VBMI__) || !defined(__AVX512IFMA__)
|
||||
#error "HOST/ARCH doesn't support CannonLake AVX512 features"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
__m512i a = _mm512_loadu_si512((const __m512i*)argv[argc-1]);
|
||||
/* IFMA */
|
||||
a = _mm512_madd52hi_epu64(a, a, _mm512_setzero_si512());
|
||||
/* VBMI */
|
||||
a = _mm512_permutex2var_epi8(a, _mm512_setzero_si512(), a);
|
||||
return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
|
||||
}
|
@ -0,0 +1,26 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
* is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#if !defined(__AVX512VPOPCNTDQ__) || !defined(__AVX512BITALG__) || !defined(__AVX512VPOPCNTDQ__)
|
||||
#error "HOST/ARCH doesn't support IceLake AVX512 features"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
__m512i a = _mm512_loadu_si512((const __m512i*)argv[argc-1]);
|
||||
/* VBMI2 */
|
||||
a = _mm512_shrdv_epi64(a, a, _mm512_setzero_si512());
|
||||
/* BITALG */
|
||||
a = _mm512_popcnt_epi8(a);
|
||||
/* VPOPCNTDQ */
|
||||
a = _mm512_popcnt_epi64(a);
|
||||
return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
|
||||
}
|
@ -0,0 +1,25 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
* is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#if !defined(__AVX512ER__) || !defined(__AVX512PF__)
|
||||
#error "HOST/ARCH doesn't support Knights Landing AVX512 features"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int base[128]={};
|
||||
__m512d ad = _mm512_loadu_pd((const __m512d*)argv[argc-1]);
|
||||
/* ER */
|
||||
__m512i a = _mm512_castpd_si512(_mm512_exp2a23_pd(ad));
|
||||
/* PF */
|
||||
_mm512_mask_prefetch_i64scatter_pd(base, _mm512_cmpeq_epi64_mask(a, a), a, 1, _MM_HINT_T1);
|
||||
return base[0];
|
||||
}
|
@ -0,0 +1,30 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
* is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#if !defined(__AVX5124FMAPS__) || !defined(__AVX5124VNNIW__) || !defined(__AVX512VPOPCNTDQ__)
|
||||
#error "HOST/ARCH doesn't support Knights Mill AVX512 features"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
__m512i a = _mm512_loadu_si512((const __m512i*)argv[argc-1]);
|
||||
__m512 b = _mm512_loadu_ps((const __m512*)argv[argc-2]);
|
||||
|
||||
/* 4FMAPS */
|
||||
b = _mm512_4fmadd_ps(b, b, b, b, b, NULL);
|
||||
/* 4VNNIW */
|
||||
a = _mm512_4dpwssd_epi32(a, a, a, a, a, NULL);
|
||||
/* VPOPCNTDQ */
|
||||
a = _mm512_popcnt_epi64(a);
|
||||
|
||||
a = _mm512_add_epi32(a, _mm512_castps_si512(b));
|
||||
return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
|
||||
}
|
@ -0,0 +1,26 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
* is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#if !defined(__AVX512VL__) || !defined(__AVX512BW__) || !defined(__AVX512DQ__)
|
||||
#error "HOST/ARCH doesn't support SkyLake AVX512 features"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
__m512i aa = _mm512_abs_epi32(_mm512_loadu_si512((const __m512i*)argv[argc-1]));
|
||||
/* VL */
|
||||
__m256i a = _mm256_abs_epi64(_mm512_extracti64x4_epi64(aa, 1));
|
||||
/* DQ */
|
||||
__m512i b = _mm512_broadcast_i32x8(a);
|
||||
/* BW */
|
||||
b = _mm512_abs_epi16(b);
|
||||
return _mm_cvtsi128_si32(_mm512_castsi512_si128(b));
|
||||
}
|
@ -0,0 +1,26 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
* is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#if !defined(__AVX512FP16__)
|
||||
#error "HOST/ARCH doesn't support Sapphire Rapids AVX512FP16 features"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
/* clang has a bug regarding our SPR code, see gh-23730. */
|
||||
#if __clang__
|
||||
#error
|
||||
#endif
|
||||
__m512h a = _mm512_loadu_ph((void*)argv[argc-1]);
|
||||
__m512h temp = _mm512_fmadd_ph(a, a, a);
|
||||
_mm512_storeu_ph((void*)(argv[argc-1]), temp);
|
||||
return 0;
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
* is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#ifndef __AVX512CD__
|
||||
#error "HOST/ARCH doesn't support AVX512CD"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
__m512i a = _mm512_lzcnt_epi32(_mm512_loadu_si512((const __m512i*)argv[argc-1]));
|
||||
return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
* is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#ifndef __AVX512F__
|
||||
#error "HOST/ARCH doesn't support AVX512F"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
__m512i a = _mm512_abs_epi32(_mm512_loadu_si512((const __m512i*)argv[argc-1]));
|
||||
return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
|
||||
}
|
@ -0,0 +1,22 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
* is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#ifndef __F16C__
|
||||
#error "HOST/ARCH doesn't support F16C"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <emmintrin.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
__m128 a = _mm_cvtph_ps(_mm_loadu_si128((const __m128i*)argv[argc-1]));
|
||||
__m256 a8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)argv[argc-2]));
|
||||
return (int)(_mm_cvtss_f32(a) + _mm_cvtss_f32(_mm256_castps256_ps128(a8)));
|
||||
}
|
@ -0,0 +1,22 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
* is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#if !defined(__FMA__) && !defined(__AVX2__)
|
||||
#error "HOST/ARCH doesn't support FMA3"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <xmmintrin.h>
|
||||
#include <immintrin.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
__m256 a = _mm256_loadu_ps((const float*)argv[argc-1]);
|
||||
a = _mm256_fmadd_ps(a, a, a);
|
||||
return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
|
||||
}
|
@ -0,0 +1,13 @@
|
||||
#include <immintrin.h>
|
||||
#ifdef _MSC_VER
|
||||
#include <ammintrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
__m256 a = _mm256_loadu_ps((const float*)argv[argc-1]);
|
||||
a = _mm256_macc_ps(a, a, a);
|
||||
return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
|
||||
}
|
@ -0,0 +1,11 @@
|
||||
#ifndef __loongarch_sx
|
||||
#error "HOST/ARCH doesn't support LSX"
|
||||
#endif
|
||||
|
||||
#include <lsxintrin.h>
|
||||
|
||||
int main(void)
|
||||
{
|
||||
__m128i a = __lsx_vadd_d(__lsx_vldi(0), __lsx_vldi(0));
|
||||
return __lsx_vpickve2gr_w(a, 0);
|
||||
}
|
@ -0,0 +1,19 @@
|
||||
#ifdef _MSC_VER
|
||||
#include <Intrin.h>
|
||||
#endif
|
||||
#include <arm_neon.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
// passing from untraced pointers to avoid optimizing out any constants
|
||||
// so we can test against the linker.
|
||||
float *src = (float*)argv[argc-1];
|
||||
float32x4_t v1 = vdupq_n_f32(src[0]), v2 = vdupq_n_f32(src[1]);
|
||||
int ret = (int)vgetq_lane_f32(vmulq_f32(v1, v2), 0);
|
||||
#ifdef __aarch64__
|
||||
double *src2 = (double*)argv[argc-2];
|
||||
float64x2_t vd1 = vdupq_n_f64(src2[0]), vd2 = vdupq_n_f64(src2[1]);
|
||||
ret += (int)vgetq_lane_f64(vmulq_f64(vd1, vd2), 0);
|
||||
#endif
|
||||
return ret;
|
||||
}
|
@ -0,0 +1,11 @@
|
||||
#ifdef _MSC_VER
|
||||
#include <Intrin.h>
|
||||
#endif
|
||||
#include <arm_neon.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
short *src = (short*)argv[argc-1];
|
||||
float32x4_t v_z4 = vcvt_f32_f16((float16x4_t)vld1_s16(src));
|
||||
return (int)vgetq_lane_f32(v_z4, 0);
|
||||
}
|
@ -0,0 +1,21 @@
|
||||
#ifdef _MSC_VER
|
||||
#include <Intrin.h>
|
||||
#endif
|
||||
#include <arm_neon.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
float *src = (float*)argv[argc-1];
|
||||
float32x4_t v1 = vdupq_n_f32(src[0]);
|
||||
float32x4_t v2 = vdupq_n_f32(src[1]);
|
||||
float32x4_t v3 = vdupq_n_f32(src[2]);
|
||||
int ret = (int)vgetq_lane_f32(vfmaq_f32(v1, v2, v3), 0);
|
||||
#ifdef __aarch64__
|
||||
double *src2 = (double*)argv[argc-2];
|
||||
float64x2_t vd1 = vdupq_n_f64(src2[0]);
|
||||
float64x2_t vd2 = vdupq_n_f64(src2[1]);
|
||||
float64x2_t vd3 = vdupq_n_f64(src2[2]);
|
||||
ret += (int)vgetq_lane_f64(vfmaq_f64(vd1, vd2, vd3), 0);
|
||||
#endif
|
||||
return ret;
|
||||
}
|
@ -0,0 +1,32 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
 * is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#if !defined(__SSE4_2__) && !defined(__POPCNT__)
|
||||
#error "HOST/ARCH doesn't support POPCNT"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#include <nmmintrin.h>
|
||||
#else
|
||||
#include <popcntintrin.h>
|
||||
#endif
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
// To make sure popcnt instructions are generated
|
||||
// and been tested against the assembler
|
||||
unsigned long long a = *((unsigned long long*)argv[argc-1]);
|
||||
unsigned int b = *((unsigned int*)argv[argc-2]);
|
||||
|
||||
#if defined(_M_X64) || defined(__x86_64__)
|
||||
a = _mm_popcnt_u64(a);
|
||||
#endif
|
||||
b = _mm_popcnt_u32(b);
|
||||
return (int)a + b;
|
||||
}
|
@ -0,0 +1,13 @@
|
||||
#ifndef __riscv_vector
|
||||
#error RVV not supported
|
||||
#endif
|
||||
|
||||
#include <riscv_vector.h>
|
||||
|
||||
int main(void)
|
||||
{
|
||||
size_t vlmax = __riscv_vsetvlmax_e32m1();
|
||||
vuint32m1_t a = __riscv_vmv_v_x_u32m1(0, vlmax);
|
||||
vuint32m1_t b = __riscv_vadd_vv_u32m1(a, a, vlmax);
|
||||
return __riscv_vmv_x_s_u32m1_u32(b);
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
* is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#ifndef __SSE__
|
||||
#error "HOST/ARCH doesn't support SSE"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <xmmintrin.h>
|
||||
|
||||
int main(void)
|
||||
{
|
||||
__m128 a = _mm_add_ps(_mm_setzero_ps(), _mm_setzero_ps());
|
||||
return (int)_mm_cvtss_f32(a);
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
* is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#ifndef __SSE2__
|
||||
#error "HOST/ARCH doesn't support SSE2"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <emmintrin.h>
|
||||
|
||||
int main(void)
|
||||
{
|
||||
__m128i a = _mm_add_epi16(_mm_setzero_si128(), _mm_setzero_si128());
|
||||
return _mm_cvtsi128_si32(a);
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
* is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#ifndef __SSE3__
|
||||
#error "HOST/ARCH doesn't support SSE3"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <pmmintrin.h>
|
||||
|
||||
int main(void)
|
||||
{
|
||||
__m128 a = _mm_hadd_ps(_mm_setzero_ps(), _mm_setzero_ps());
|
||||
return (int)_mm_cvtss_f32(a);
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
* is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#ifndef __SSE4_1__
|
||||
#error "HOST/ARCH doesn't support SSE41"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <smmintrin.h>
|
||||
|
||||
int main(void)
|
||||
{
|
||||
__m128 a = _mm_floor_ps(_mm_setzero_ps());
|
||||
return (int)_mm_cvtss_f32(a);
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
* is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#ifndef __SSE4_2__
|
||||
#error "HOST/ARCH doesn't support SSE42"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <smmintrin.h>
|
||||
|
||||
int main(void)
|
||||
{
|
||||
__m128 a = _mm_hadd_ps(_mm_setzero_ps(), _mm_setzero_ps());
|
||||
return (int)_mm_cvtss_f32(a);
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
|
||||
/*
|
||||
* Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
|
||||
* whether or not the build options for those features are specified.
|
||||
* Therefore, we must test #definitions of CPU features when option native/host
|
||||
* is enabled via `--cpu-baseline` or through env var `CFLAGS` otherwise
|
||||
* the test will be broken and leads to enable all possible features.
|
||||
*/
|
||||
#ifndef __SSSE3__
|
||||
#error "HOST/ARCH doesn't support SSSE3"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#include <tmmintrin.h>
|
||||
|
||||
int main(void)
|
||||
{
|
||||
__m128i a = _mm_hadd_epi16(_mm_setzero_si128(), _mm_setzero_si128());
|
||||
return (int)_mm_cvtsi128_si32(a);
|
||||
}
|
@ -0,0 +1,14 @@
|
||||
#include <arm_sve.h>
|
||||
|
||||
int accumulate(svint64_t a, svint64_t b) {
|
||||
svbool_t p = svptrue_b64();
|
||||
return svaddv(p, svmla_z(p, a, a, b));
|
||||
}
|
||||
|
||||
int main(void)
|
||||
{
|
||||
svbool_t p = svptrue_b64();
|
||||
svint64_t a = svdup_s64(1);
|
||||
svint64_t b = svdup_s64(2);
|
||||
return accumulate(a, b);
|
||||
}
|
@ -0,0 +1,21 @@
|
||||
#ifndef __VSX__
|
||||
#error "VSX is not supported"
|
||||
#endif
|
||||
#include <altivec.h>
|
||||
|
||||
#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
|
||||
#define vsx_ld vec_vsx_ld
|
||||
#define vsx_st vec_vsx_st
|
||||
#else
|
||||
#define vsx_ld vec_xl
|
||||
#define vsx_st vec_xst
|
||||
#endif
|
||||
|
||||
int main(void)
|
||||
{
|
||||
unsigned int zout[4];
|
||||
unsigned int z4[] = {0, 0, 0, 0};
|
||||
__vector unsigned int v_z4 = vsx_ld(0, z4);
|
||||
vsx_st(v_z4, 0, zout);
|
||||
return zout[0];
|
||||
}
|
@ -0,0 +1,13 @@
|
||||
#ifndef __VSX__
|
||||
#error "VSX is not supported"
|
||||
#endif
|
||||
#include <altivec.h>
|
||||
|
||||
typedef __vector unsigned long long v_uint64x2;
|
||||
|
||||
int main(void)
|
||||
{
|
||||
v_uint64x2 z2 = (v_uint64x2){0, 0};
|
||||
z2 = (v_uint64x2)vec_cmpeq(z2, z2);
|
||||
return (int)vec_extract(z2, 0);
|
||||
}
|
@ -0,0 +1,13 @@
|
||||
#ifndef __VSX__
|
||||
#error "VSX is not supported"
|
||||
#endif
|
||||
#include <altivec.h>
|
||||
|
||||
typedef __vector unsigned int v_uint32x4;
|
||||
|
||||
int main(void)
|
||||
{
|
||||
v_uint32x4 z4 = (v_uint32x4){0, 0, 0, 0};
|
||||
z4 = vec_absd(z4, z4);
|
||||
return (int)vec_extract(z4, 0);
|
||||
}
|
@ -0,0 +1,14 @@
|
||||
#ifndef __VSX__
|
||||
#error "VSX is not supported"
|
||||
#endif
|
||||
#include <altivec.h>
|
||||
|
||||
typedef __vector unsigned int v_uint32x4;
|
||||
|
||||
int main(void)
|
||||
{
|
||||
v_uint32x4 v1 = (v_uint32x4){2, 4, 8, 16};
|
||||
v_uint32x4 v2 = (v_uint32x4){2, 2, 2, 2};
|
||||
v_uint32x4 v3 = vec_mod(v1, v2);
|
||||
return (int)vec_extractm(v3);
|
||||
}
|
16
lib/python3.11/site-packages/numpy/distutils/checks/cpu_vx.c
Normal file
16
lib/python3.11/site-packages/numpy/distutils/checks/cpu_vx.c
Normal file
@ -0,0 +1,16 @@
|
||||
#if (__VEC__ < 10301) || (__ARCH__ < 11)
|
||||
#error VX not supported
|
||||
#endif
|
||||
|
||||
#include <vecintrin.h>
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
__vector double x = vec_abs(vec_xl(argc, (double*)argv));
|
||||
__vector double y = vec_load_len((double*)argv, (unsigned int)argc);
|
||||
|
||||
x = vec_round(vec_ceil(x) + vec_floor(y));
|
||||
__vector bool long long m = vec_cmpge(x, y);
|
||||
__vector long long i = vec_signed(vec_sel(x, y, m));
|
||||
|
||||
return (int)vec_extract(i, 0);
|
||||
}
|
@ -0,0 +1,25 @@
|
||||
#if (__VEC__ < 10302) || (__ARCH__ < 12)
|
||||
#error VXE not supported
|
||||
#endif
|
||||
|
||||
#include <vecintrin.h>
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
__vector float x = vec_nabs(vec_xl(argc, (float*)argv));
|
||||
__vector float y = vec_load_len((float*)argv, (unsigned int)argc);
|
||||
|
||||
x = vec_round(vec_ceil(x) + vec_floor(y));
|
||||
__vector bool int m = vec_cmpge(x, y);
|
||||
x = vec_sel(x, y, m);
|
||||
|
||||
// need to test the existence of intrin "vflls" since vec_doublee
|
||||
// maps to the wrong intrinsic "vfll".
|
||||
// see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100871
|
||||
#if defined(__GNUC__) && !defined(__clang__)
|
||||
__vector long long i = vec_signed(__builtin_s390_vflls(x));
|
||||
#else
|
||||
__vector long long i = vec_signed(vec_doublee(x));
|
||||
#endif
|
||||
|
||||
return (int)vec_extract(i, 0);
|
||||
}
|
@ -0,0 +1,21 @@
|
||||
#if (__VEC__ < 10303) || (__ARCH__ < 13)
|
||||
#error VXE2 not supported
|
||||
#endif
|
||||
|
||||
#include <vecintrin.h>
|
||||
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
int val;
|
||||
__vector signed short large = { 'a', 'b', 'c', 'a', 'g', 'h', 'g', 'o' };
|
||||
__vector signed short search = { 'g', 'h', 'g', 'o' };
|
||||
__vector unsigned char len = { 0 };
|
||||
__vector unsigned char res = vec_search_string_cc(large, search, len, &val);
|
||||
__vector float x = vec_xl(argc, (float*)argv);
|
||||
__vector int i = vec_signed(x);
|
||||
|
||||
i = vec_srdb(vec_sldb(i, i, 2), i, 3);
|
||||
val += (int)vec_extract(res, 1);
|
||||
val += vec_extract(i, 0);
|
||||
return val;
|
||||
}
|
@ -0,0 +1,12 @@
|
||||
#include <immintrin.h>
|
||||
#ifdef _MSC_VER
|
||||
#include <ammintrin.h>
|
||||
#else
|
||||
#include <x86intrin.h>
|
||||
#endif
|
||||
|
||||
int main(void)
|
||||
{
|
||||
__m128i a = _mm_comge_epu32(_mm_setzero_si128(), _mm_setzero_si128());
|
||||
return _mm_cvtsi128_si32(a);
|
||||
}
|
@ -0,0 +1,18 @@
|
||||
#include <immintrin.h>
|
||||
/**
|
||||
* Test BW mask operations due to:
|
||||
* - MSVC has supported it since vs2019 see,
|
||||
* https://developercommunity.visualstudio.com/content/problem/518298/missing-avx512bw-mask-intrinsics.html
|
||||
* - Clang >= v8.0
|
||||
* - GCC >= v7.1
|
||||
*/
|
||||
int main(void)
|
||||
{
|
||||
__mmask64 m64 = _mm512_cmpeq_epi8_mask(_mm512_set1_epi8((char)1), _mm512_set1_epi8((char)1));
|
||||
m64 = _kor_mask64(m64, m64);
|
||||
m64 = _kxor_mask64(m64, m64);
|
||||
m64 = _cvtu64_mask64(_cvtmask64_u64(m64));
|
||||
m64 = _mm512_kunpackd(m64, m64);
|
||||
m64 = (__mmask64)_mm512_kunpackw((__mmask32)m64, (__mmask32)m64);
|
||||
return (int)_cvtmask64_u64(m64);
|
||||
}
|
@ -0,0 +1,16 @@
|
||||
#include <immintrin.h>
|
||||
/**
|
||||
* Test DQ mask operations due to:
|
||||
* - MSVC has supported it since vs2019 see,
|
||||
* https://developercommunity.visualstudio.com/content/problem/518298/missing-avx512bw-mask-intrinsics.html
|
||||
* - Clang >= v8.0
|
||||
* - GCC >= v7.1
|
||||
*/
|
||||
int main(void)
|
||||
{
|
||||
__mmask8 m8 = _mm512_cmpeq_epi64_mask(_mm512_set1_epi64(1), _mm512_set1_epi64(1));
|
||||
m8 = _kor_mask8(m8, m8);
|
||||
m8 = _kxor_mask8(m8, m8);
|
||||
m8 = _cvtu32_mask8(_cvtmask8_u32(m8));
|
||||
return (int)_cvtmask8_u32(m8);
|
||||
}
|
@ -0,0 +1,41 @@
|
||||
#include <immintrin.h>
|
||||
/**
|
||||
* The following intrinsics don't have direct native support but compilers
|
||||
* tend to emulate them.
|
||||
* They're usually supported by gcc >= 7.1, clang >= 4 and icc >= 19
|
||||
*/
|
||||
int main(void)
|
||||
{
|
||||
__m512 one_ps = _mm512_set1_ps(1.0f);
|
||||
__m512d one_pd = _mm512_set1_pd(1.0);
|
||||
__m512i one_i64 = _mm512_set1_epi64(1);
|
||||
// add
|
||||
float sum_ps = _mm512_reduce_add_ps(one_ps);
|
||||
double sum_pd = _mm512_reduce_add_pd(one_pd);
|
||||
int sum_int = (int)_mm512_reduce_add_epi64(one_i64);
|
||||
sum_int += (int)_mm512_reduce_add_epi32(one_i64);
|
||||
// mul
|
||||
sum_ps += _mm512_reduce_mul_ps(one_ps);
|
||||
sum_pd += _mm512_reduce_mul_pd(one_pd);
|
||||
sum_int += (int)_mm512_reduce_mul_epi64(one_i64);
|
||||
sum_int += (int)_mm512_reduce_mul_epi32(one_i64);
|
||||
// min
|
||||
sum_ps += _mm512_reduce_min_ps(one_ps);
|
||||
sum_pd += _mm512_reduce_min_pd(one_pd);
|
||||
sum_int += (int)_mm512_reduce_min_epi32(one_i64);
|
||||
sum_int += (int)_mm512_reduce_min_epu32(one_i64);
|
||||
sum_int += (int)_mm512_reduce_min_epi64(one_i64);
|
||||
// max
|
||||
sum_ps += _mm512_reduce_max_ps(one_ps);
|
||||
sum_pd += _mm512_reduce_max_pd(one_pd);
|
||||
sum_int += (int)_mm512_reduce_max_epi32(one_i64);
|
||||
sum_int += (int)_mm512_reduce_max_epu32(one_i64);
|
||||
sum_int += (int)_mm512_reduce_max_epi64(one_i64);
|
||||
// and
|
||||
sum_int += (int)_mm512_reduce_and_epi32(one_i64);
|
||||
sum_int += (int)_mm512_reduce_and_epi64(one_i64);
|
||||
// or
|
||||
sum_int += (int)_mm512_reduce_or_epi32(one_i64);
|
||||
sum_int += (int)_mm512_reduce_or_epi64(one_i64);
|
||||
return (int)sum_ps + (int)sum_pd + sum_int;
|
||||
}
|
@ -0,0 +1,12 @@
|
||||
/**
|
||||
* Assembler may not fully support the following VSX3 scalar
|
||||
* instructions, even though compilers report VSX3 support.
|
||||
*/
|
||||
int main(void)
|
||||
{
|
||||
unsigned short bits = 0xFF;
|
||||
double f;
|
||||
__asm__ __volatile__("xscvhpdp %x0,%x1" : "=wa"(f) : "wa"(bits));
|
||||
__asm__ __volatile__ ("xscvdphp %x0,%x1" : "=wa" (bits) : "wa" (f));
|
||||
return bits;
|
||||
}
|
@ -0,0 +1,21 @@
|
||||
#ifndef __VSX__
|
||||
#error "VSX is not supported"
|
||||
#endif
|
||||
#include <altivec.h>
|
||||
|
||||
typedef __vector float fv4sf_t;
|
||||
typedef __vector unsigned char vec_t;
|
||||
|
||||
int main(void)
|
||||
{
|
||||
__vector_quad acc0;
|
||||
float a[4] = {0,1,2,3};
|
||||
float b[4] = {0,1,2,3};
|
||||
vec_t *va = (vec_t *) a;
|
||||
vec_t *vb = (vec_t *) b;
|
||||
__builtin_mma_xvf32ger(&acc0, va[0], vb[0]);
|
||||
fv4sf_t result[4];
|
||||
__builtin_mma_disassemble_acc((void *)result, &acc0);
|
||||
fv4sf_t c0 = result[0];
|
||||
return (int)((float*)&c0)[0];
|
||||
}
|
@ -0,0 +1,36 @@
|
||||
/**
|
||||
* Testing ASM VSX register number fixer '%x<n>'
|
||||
*
|
||||
* old versions of CLANG doesn't support %x<n> in the inline asm template
|
||||
* which fixes register number when using any of the register constraints wa, wd, wf.
|
||||
*
|
||||
* xref:
|
||||
* - https://bugs.llvm.org/show_bug.cgi?id=31837
|
||||
* - https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html
|
||||
*/
|
||||
#ifndef __VSX__
|
||||
#error "VSX is not supported"
|
||||
#endif
|
||||
#include <altivec.h>
|
||||
|
||||
#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
|
||||
#define vsx_ld vec_vsx_ld
|
||||
#define vsx_st vec_vsx_st
|
||||
#else
|
||||
#define vsx_ld vec_xl
|
||||
#define vsx_st vec_xst
|
||||
#endif
|
||||
|
||||
int main(void)
|
||||
{
|
||||
float z4[] = {0, 0, 0, 0};
|
||||
signed int zout[] = {0, 0, 0, 0};
|
||||
|
||||
__vector float vz4 = vsx_ld(0, z4);
|
||||
__vector signed int asm_ret = vsx_ld(0, zout);
|
||||
|
||||
__asm__ ("xvcvspsxws %x0,%x1" : "=wa" (vz4) : "wa" (asm_ret));
|
||||
|
||||
vsx_st(asm_ret, 0, zout);
|
||||
return zout[0];
|
||||
}
|
@ -0,0 +1 @@
|
||||
int test_flags;
|
Reference in New Issue
Block a user