@@ -0,0 +1,27 @@
#ifdef _MSC_VER
    #include <Intrin.h>
#endif
#include <arm_neon.h>

int main(int argc, char **argv)
{
    float *src = (float*)argv[argc-1];
    float32x4_t v1 = vdupq_n_f32(src[0]), v2 = vdupq_n_f32(src[1]);
    /* MAXMIN */
    int ret  = (int)vgetq_lane_f32(vmaxnmq_f32(v1, v2), 0);
        ret += (int)vgetq_lane_f32(vminnmq_f32(v1, v2), 0);
    /* ROUNDING */
    ret += (int)vgetq_lane_f32(vrndq_f32(v1), 0);
#ifdef __aarch64__
    {
        double *src2 = (double*)argv[argc-1];
        float64x2_t vd1 = vdupq_n_f64(src2[0]), vd2 = vdupq_n_f64(src2[1]);
        /* MAXMIN */
        ret += (int)vgetq_lane_f64(vmaxnmq_f64(vd1, vd2), 0);
        ret += (int)vgetq_lane_f64(vminnmq_f64(vd1, vd2), 0);
        /* ROUNDING */
        ret += (int)vgetq_lane_f64(vrndq_f64(vd1), 0);
    }
#endif
    return ret;
}
@@ -0,0 +1,16 @@
#ifdef _MSC_VER
    #include <Intrin.h>
#endif
#include <arm_neon.h>

int main(int argc, char **argv)
{
    unsigned char *src = (unsigned char*)argv[argc-1];
    uint8x16_t v1 = vdupq_n_u8(src[0]), v2 = vdupq_n_u8(src[1]);
    uint32x4_t va = vdupq_n_u32(3);
    int ret = (int)vgetq_lane_u32(vdotq_u32(va, v1, v2), 0);
#ifdef __aarch64__
    ret += (int)vgetq_lane_u32(vdotq_laneq_u32(va, v1, v2, 0), 0);
#endif
    return ret;
}
@@ -0,0 +1,19 @@
#ifdef _MSC_VER
    #include <Intrin.h>
#endif
#include <arm_neon.h>

int main(int argc, char **argv)
{
    float16_t *src = (float16_t*)argv[argc-1];
    float *src2 = (float*)argv[argc-2];
    float16x8_t vhp  = vdupq_n_f16(src[0]);
    float16x4_t vlhp = vdup_n_f16(src[1]);
    float32x4_t vf   = vdupq_n_f32(src2[0]);
    float32x2_t vlf  = vdup_n_f32(src2[1]);

    int ret  = (int)vget_lane_f32(vfmlal_low_f16(vlf, vlhp, vlhp), 0);
        ret += (int)vgetq_lane_f32(vfmlslq_high_f16(vf, vhp, vhp), 0);

    return ret;
}
@@ -0,0 +1,15 @@
#ifdef _MSC_VER
    #include <Intrin.h>
#endif
#include <arm_neon.h>

int main(int argc, char **argv)
{
    float16_t *src = (float16_t*)argv[argc-1];
    float16x8_t vhp  = vdupq_n_f16(src[0]);
    float16x4_t vlhp = vdup_n_f16(src[1]);

    int ret  =  (int)vgetq_lane_f16(vabdq_f16(vhp, vhp), 0);
        ret  += (int)vget_lane_f16(vabd_f16(vlhp, vlhp), 0);
    return ret;
}
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __AVX__
        #error "HOST/ARCH doesn't support AVX"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    __m256 a = _mm256_add_ps(_mm256_loadu_ps((const float*)argv[argc-1]), _mm256_loadu_ps((const float*)argv[1]));
    return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
}
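The guard above is what makes the probe meaningful under ICC. A minimal sketch of the failure mode it prevents (illustrative code, not part of the commit):

#include <immintrin.h>

/* Under ICC without any AVX build flags this still compiles and links,
 * so compilation success alone would wrongly report AVX support; only
 * the #ifndef __AVX__ check above catches the mismatch. */
int naive_avx_probe(void)
{
    __m256 a = _mm256_setzero_ps();
    return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
}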
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __AVX2__
        #error "HOST/ARCH doesn't support AVX2"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    __m256i a = _mm256_abs_epi16(_mm256_loadu_si256((const __m256i*)argv[argc-1]));
    return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
}
@@ -0,0 +1,22 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __AVX512VNNI__
        #error "HOST/ARCH doesn't support CascadeLake AVX512 features"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    /* VNNI */
    __m512i a = _mm512_loadu_si512((const __m512i*)argv[argc-1]);
            a = _mm512_dpbusd_epi32(a, _mm512_setzero_si512(), a);
    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
}
@@ -0,0 +1,24 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #if !defined(__AVX512VBMI__) || !defined(__AVX512IFMA__)
        #error "HOST/ARCH doesn't support CannonLake AVX512 features"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    __m512i a = _mm512_loadu_si512((const __m512i*)argv[argc-1]);
    /* IFMA */
    a = _mm512_madd52hi_epu64(a, a, _mm512_setzero_si512());
    /* VBMI */
    a = _mm512_permutex2var_epi8(a, _mm512_setzero_si512(), a);
    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
}
@@ -0,0 +1,26 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #if !defined(__AVX512VBMI2__) || !defined(__AVX512BITALG__) || !defined(__AVX512VPOPCNTDQ__)
        #error "HOST/ARCH doesn't support IceLake AVX512 features"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    __m512i a = _mm512_loadu_si512((const __m512i*)argv[argc-1]);
    /* VBMI2 */
    a = _mm512_shrdv_epi64(a, a, _mm512_setzero_si512());
    /* BITALG */
    a = _mm512_popcnt_epi8(a);
    /* VPOPCNTDQ */
    a = _mm512_popcnt_epi64(a);
    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
}
@@ -0,0 +1,25 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #if !defined(__AVX512ER__) || !defined(__AVX512PF__)
        #error "HOST/ARCH doesn't support Knights Landing AVX512 features"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    int base[128] = {0};
    __m512d ad = _mm512_loadu_pd((const __m512d*)argv[argc-1]);
    /* ER */
    __m512i a = _mm512_castpd_si512(_mm512_exp2a23_pd(ad));
    /* PF */
    _mm512_mask_prefetch_i64scatter_pd(base, _mm512_cmpeq_epi64_mask(a, a), a, 1, _MM_HINT_T1);
    return base[0];
}
@@ -0,0 +1,30 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #if !defined(__AVX5124FMAPS__) || !defined(__AVX5124VNNIW__) || !defined(__AVX512VPOPCNTDQ__)
        #error "HOST/ARCH doesn't support Knights Mill AVX512 features"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    __m512i a = _mm512_loadu_si512((const __m512i*)argv[argc-1]);
    __m512 b = _mm512_loadu_ps((const __m512*)argv[argc-2]);

    /* 4FMAPS */
    b = _mm512_4fmadd_ps(b, b, b, b, b, NULL);
    /* 4VNNIW */
    a = _mm512_4dpwssd_epi32(a, a, a, a, a, NULL);
    /* VPOPCNTDQ */
    a = _mm512_popcnt_epi64(a);

    a = _mm512_add_epi32(a, _mm512_castps_si512(b));
    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
}
@@ -0,0 +1,26 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #if !defined(__AVX512VL__) || !defined(__AVX512BW__) || !defined(__AVX512DQ__)
        #error "HOST/ARCH doesn't support SkyLake AVX512 features"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    __m512i aa = _mm512_abs_epi32(_mm512_loadu_si512((const __m512i*)argv[argc-1]));
    /* VL */
    __m256i a = _mm256_abs_epi64(_mm512_extracti64x4_epi64(aa, 1));
    /* DQ */
    __m512i b = _mm512_broadcast_i32x8(a);
    /* BW */
    b = _mm512_abs_epi16(b);
    return _mm_cvtsi128_si32(_mm512_castsi512_si128(b));
}
@@ -0,0 +1,26 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #if !defined(__AVX512FP16__)
        #error "HOST/ARCH doesn't support Sapphire Rapids AVX512FP16 features"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
/* clang has a bug regarding our spr code, see gh-23730. */
#if __clang__
#error
#endif
    __m512h a = _mm512_loadu_ph((void*)argv[argc-1]);
    __m512h temp = _mm512_fmadd_ph(a, a, a);
    _mm512_storeu_ph((void*)(argv[argc-1]), temp);
    return 0;
}
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __AVX512CD__
        #error "HOST/ARCH doesn't support AVX512CD"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    __m512i a = _mm512_lzcnt_epi32(_mm512_loadu_si512((const __m512i*)argv[argc-1]));
    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
}
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __AVX512F__
        #error "HOST/ARCH doesn't support AVX512F"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    __m512i a = _mm512_abs_epi32(_mm512_loadu_si512((const __m512i*)argv[argc-1]));
    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
}
@@ -0,0 +1,22 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __F16C__
        #error "HOST/ARCH doesn't support F16C"
    #endif
#endif

#include <emmintrin.h>
#include <immintrin.h>

int main(int argc, char **argv)
{
    __m128 a  = _mm_cvtph_ps(_mm_loadu_si128((const __m128i*)argv[argc-1]));
    __m256 a8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)argv[argc-2]));
    return (int)(_mm_cvtss_f32(a) + _mm_cvtss_f32(_mm256_castps256_ps128(a8)));
}
@@ -0,0 +1,22 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #if !defined(__FMA__) && !defined(__AVX2__)
        #error "HOST/ARCH doesn't support FMA3"
    #endif
#endif

#include <xmmintrin.h>
#include <immintrin.h>

int main(int argc, char **argv)
{
    __m256 a = _mm256_loadu_ps((const float*)argv[argc-1]);
           a = _mm256_fmadd_ps(a, a, a);
    return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
}
@@ -0,0 +1,13 @@
#include <immintrin.h>
#ifdef _MSC_VER
    #include <ammintrin.h>
#else
    #include <x86intrin.h>
#endif

int main(int argc, char **argv)
{
    __m256 a = _mm256_loadu_ps((const float*)argv[argc-1]);
           a = _mm256_macc_ps(a, a, a);
    return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
}
@@ -0,0 +1,11 @@
#ifndef __loongarch_sx
#error "HOST/ARCH doesn't support LSX"
#endif

#include <lsxintrin.h>

int main(void)
{
    __m128i a = __lsx_vadd_d(__lsx_vldi(0), __lsx_vldi(0));
    return __lsx_vpickve2gr_w(a, 0);
}
@@ -0,0 +1,19 @@
#ifdef _MSC_VER
    #include <Intrin.h>
#endif
#include <arm_neon.h>

int main(int argc, char **argv)
{
    // Load inputs through untraced pointers so the compiler cannot fold
    // them into constants; this keeps the test meaningful for the linker.
    float *src = (float*)argv[argc-1];
    float32x4_t v1 = vdupq_n_f32(src[0]), v2 = vdupq_n_f32(src[1]);
    int ret = (int)vgetq_lane_f32(vmulq_f32(v1, v2), 0);
#ifdef __aarch64__
    double *src2 = (double*)argv[argc-2];
    float64x2_t vd1 = vdupq_n_f64(src2[0]), vd2 = vdupq_n_f64(src2[1]);
    ret += (int)vgetq_lane_f64(vmulq_f64(vd1, vd2), 0);
#endif
    return ret;
}
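The comment about untraced pointers encodes a real hazard: with literal constants the whole computation can be folded at compile time, leaving no NEON instructions for the assembler or linker to validate. A hedged counter-example of what the probe deliberately avoids (not part of the commit):

#include <arm_neon.h>

/* Anti-pattern: every input is a compile-time constant, so the multiply
 * can fold to 4.0f and the probe may "pass" without a single NEON
 * instruction reaching the object file. */
int folded_probe(void)
{
    float32x4_t v = vdupq_n_f32(2.0f);
    return (int)vgetq_lane_f32(vmulq_f32(v, v), 0);
}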
@@ -0,0 +1,11 @@
#ifdef _MSC_VER
    #include <Intrin.h>
#endif
#include <arm_neon.h>

int main(int argc, char **argv)
{
    short *src = (short*)argv[argc-1];
    float32x4_t v_z4 = vcvt_f32_f16((float16x4_t)vld1_s16(src));
    return (int)vgetq_lane_f32(v_z4, 0);
}
@@ -0,0 +1,21 @@
#ifdef _MSC_VER
    #include <Intrin.h>
#endif
#include <arm_neon.h>

int main(int argc, char **argv)
{
    float *src = (float*)argv[argc-1];
    float32x4_t v1 = vdupq_n_f32(src[0]);
    float32x4_t v2 = vdupq_n_f32(src[1]);
    float32x4_t v3 = vdupq_n_f32(src[2]);
    int ret = (int)vgetq_lane_f32(vfmaq_f32(v1, v2, v3), 0);
#ifdef __aarch64__
    double *src2 = (double*)argv[argc-2];
    float64x2_t vd1 = vdupq_n_f64(src2[0]);
    float64x2_t vd2 = vdupq_n_f64(src2[1]);
    float64x2_t vd3 = vdupq_n_f64(src2[2]);
    ret += (int)vgetq_lane_f64(vfmaq_f64(vd1, vd2, vd3), 0);
#endif
    return ret;
}
@@ -0,0 +1,32 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #if !defined(__SSE4_2__) && !defined(__POPCNT__)
        #error "HOST/ARCH doesn't support POPCNT"
    #endif
#endif

#ifdef _MSC_VER
    #include <nmmintrin.h>
#else
    #include <popcntintrin.h>
#endif

int main(int argc, char **argv)
{
    // Load the operands from memory to make sure popcnt instructions
    // are generated and tested against the assembler.
    unsigned long long a = *((unsigned long long*)argv[argc-1]);
    unsigned int b = *((unsigned int*)argv[argc-2]);

#if defined(_M_X64) || defined(__x86_64__)
    a = _mm_popcnt_u64(a);
#endif
    b = _mm_popcnt_u32(b);
    return (int)a + b;
}
@@ -0,0 +1,13 @@
#ifndef __riscv_vector
  #error RVV not supported
#endif

#include <riscv_vector.h>

int main(void)
{
    size_t vlmax = __riscv_vsetvlmax_e32m1();
    vuint32m1_t a = __riscv_vmv_v_x_u32m1(0, vlmax);
    vuint32m1_t b = __riscv_vadd_vv_u32m1(a, a, vlmax);
    return __riscv_vmv_x_s_u32m1_u32(b);
}
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __SSE__
        #error "HOST/ARCH doesn't support SSE"
    #endif
#endif

#include <xmmintrin.h>

int main(void)
{
    __m128 a = _mm_add_ps(_mm_setzero_ps(), _mm_setzero_ps());
    return (int)_mm_cvtss_f32(a);
}
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __SSE2__
        #error "HOST/ARCH doesn't support SSE2"
    #endif
#endif

#include <emmintrin.h>

int main(void)
{
    __m128i a = _mm_add_epi16(_mm_setzero_si128(), _mm_setzero_si128());
    return _mm_cvtsi128_si32(a);
}
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __SSE3__
        #error "HOST/ARCH doesn't support SSE3"
    #endif
#endif

#include <pmmintrin.h>

int main(void)
{
    __m128 a = _mm_hadd_ps(_mm_setzero_ps(), _mm_setzero_ps());
    return (int)_mm_cvtss_f32(a);
}
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __SSE4_1__
        #error "HOST/ARCH doesn't support SSE41"
    #endif
#endif

#include <smmintrin.h>

int main(void)
{
    __m128 a = _mm_floor_ps(_mm_setzero_ps());
    return (int)_mm_cvtss_f32(a);
}
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __SSE4_2__
        #error "HOST/ARCH doesn't support SSE42"
    #endif
#endif

#include <smmintrin.h>

int main(void)
{
    __m128 a = _mm_hadd_ps(_mm_setzero_ps(), _mm_setzero_ps());
    return (int)_mm_cvtss_f32(a);
}
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __SSSE3__
        #error "HOST/ARCH doesn't support SSSE3"
    #endif
#endif

#include <tmmintrin.h>

int main(void)
{
    __m128i a = _mm_hadd_epi16(_mm_setzero_si128(), _mm_setzero_si128());
    return (int)_mm_cvtsi128_si32(a);
}
@@ -0,0 +1,13 @@
#include <arm_sve.h>

int accumulate(svint64_t a, svint64_t b) {
    svbool_t p = svptrue_b64();
    return svaddv(p, svmla_z(p, a, a, b));
}

int main(void)
{
    svint64_t a = svdup_s64(1);
    svint64_t b = svdup_s64(2);
    return accumulate(a, b);
}
@@ -0,0 +1,21 @@
#ifndef __VSX__
    #error "VSX is not supported"
#endif
#include <altivec.h>

#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
    #define vsx_ld  vec_vsx_ld
    #define vsx_st  vec_vsx_st
#else
    #define vsx_ld  vec_xl
    #define vsx_st  vec_xst
#endif

int main(void)
{
    unsigned int zout[4];
    unsigned int z4[] = {0, 0, 0, 0};
    __vector unsigned int v_z4 = vsx_ld(0, z4);
    vsx_st(v_z4, 0, zout);
    return zout[0];
}
@@ -0,0 +1,13 @@
#ifndef __VSX__
    #error "VSX is not supported"
#endif
#include <altivec.h>

typedef __vector unsigned long long v_uint64x2;

int main(void)
{
    v_uint64x2 z2 = (v_uint64x2){0, 0};
    z2 = (v_uint64x2)vec_cmpeq(z2, z2);
    return (int)vec_extract(z2, 0);
}
@@ -0,0 +1,13 @@
#ifndef __VSX__
    #error "VSX is not supported"
#endif
#include <altivec.h>

typedef __vector unsigned int v_uint32x4;

int main(void)
{
    v_uint32x4 z4 = (v_uint32x4){0, 0, 0, 0};
    z4 = vec_absd(z4, z4);
    return (int)vec_extract(z4, 0);
}
@@ -0,0 +1,14 @@
#ifndef __VSX__
    #error "VSX is not supported"
#endif
#include <altivec.h>

typedef __vector unsigned int v_uint32x4;

int main(void)
{
    v_uint32x4 v1 = (v_uint32x4){2, 4, 8, 16};
    v_uint32x4 v2 = (v_uint32x4){2, 2, 2, 2};
    v_uint32x4 v3 = vec_mod(v1, v2);
    return (int)vec_extractm(v3);
}
lib/python3.11/site-packages/numpy/distutils/checks/cpu_vx.c (new file, 16 lines)
@@ -0,0 +1,16 @@
#if (__VEC__ < 10301) || (__ARCH__ < 11)
    #error VX not supported
#endif

#include <vecintrin.h>
int main(int argc, char **argv)
{
    __vector double x = vec_abs(vec_xl(argc, (double*)argv));
    __vector double y = vec_load_len((double*)argv, (unsigned int)argc);

    x = vec_round(vec_ceil(x) + vec_floor(y));
    __vector bool long long m = vec_cmpge(x, y);
    __vector long long i = vec_signed(vec_sel(x, y, m));

    return (int)vec_extract(i, 0);
}
@@ -0,0 +1,25 @@
#if (__VEC__ < 10302) || (__ARCH__ < 12)
    #error VXE not supported
#endif

#include <vecintrin.h>
int main(int argc, char **argv)
{
    __vector float x = vec_nabs(vec_xl(argc, (float*)argv));
    __vector float y = vec_load_len((float*)argv, (unsigned int)argc);

    x = vec_round(vec_ceil(x) + vec_floor(y));
    __vector bool int m = vec_cmpge(x, y);
    x = vec_sel(x, y, m);

    // need to test for the existence of the intrinsic "vflls", since
    // vec_doublee maps to the wrong intrinsic "vfll" on GCC.
    // see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100871
#if defined(__GNUC__) && !defined(__clang__)
    __vector long long i = vec_signed(__builtin_s390_vflls(x));
#else
    __vector long long i = vec_signed(vec_doublee(x));
#endif

    return (int)vec_extract(i, 0);
}
@@ -0,0 +1,21 @@
#if (__VEC__ < 10303) || (__ARCH__ < 13)
    #error VXE2 not supported
#endif

#include <vecintrin.h>

int main(int argc, char **argv)
{
    int val;
    __vector signed short large = { 'a', 'b', 'c', 'a', 'g', 'h', 'g', 'o' };
    __vector signed short search = { 'g', 'h', 'g', 'o' };
    __vector unsigned char len = { 0 };
    __vector unsigned char res = vec_search_string_cc(large, search, len, &val);
    __vector float x = vec_xl(argc, (float*)argv);
    __vector int i = vec_signed(x);

    i = vec_srdb(vec_sldb(i, i, 2), i, 3);
    val += (int)vec_extract(res, 1);
    val += vec_extract(i, 0);
    return val;
}
@@ -0,0 +1,12 @@
#include <immintrin.h>
#ifdef _MSC_VER
    #include <ammintrin.h>
#else
    #include <x86intrin.h>
#endif

int main(void)
{
    __m128i a = _mm_comge_epu32(_mm_setzero_si128(), _mm_setzero_si128());
    return _mm_cvtsi128_si32(a);
}
@@ -0,0 +1,18 @@
#include <immintrin.h>
/**
 * Test BW mask operations, which require:
 *  - MSVC since VS2019, see
 *    https://developercommunity.visualstudio.com/content/problem/518298/missing-avx512bw-mask-intrinsics.html
 *  - Clang >= v8.0
 *  - GCC >= v7.1
 */
int main(void)
{
    __mmask64 m64 = _mm512_cmpeq_epi8_mask(_mm512_set1_epi8((char)1), _mm512_set1_epi8((char)1));
    m64 = _kor_mask64(m64, m64);
    m64 = _kxor_mask64(m64, m64);
    m64 = _cvtu64_mask64(_cvtmask64_u64(m64));
    m64 = _mm512_kunpackd(m64, m64);
    m64 = (__mmask64)_mm512_kunpackw((__mmask32)m64, (__mmask32)m64);
    return (int)_cvtmask64_u64(m64);
}
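Since `__mmask64` is an ordinary 64-bit integer type, the same dataflow can be approximated with plain integer operators on toolchains older than those listed above; a rough polyfill sketch (an assumption about how one might work around missing intrinsics, not code from this commit — the function name is hypothetical):

#include <immintrin.h>

/* Hypothetical fallback: _kor_mask64/_kxor_mask64 become | and ^, and
 * the mask<->integer conversion intrinsics reduce to plain casts. */
unsigned long long mask64_fallback(__mmask64 a, __mmask64 b)
{
    unsigned long long x = (unsigned long long)a | (unsigned long long)b; /* _kor_mask64  */
    x ^= (unsigned long long)b;                                           /* _kxor_mask64 */
    return x;
}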
@@ -0,0 +1,16 @@
#include <immintrin.h>
/**
 * Test DQ mask operations, which require:
 *  - MSVC since VS2019, see
 *    https://developercommunity.visualstudio.com/content/problem/518298/missing-avx512bw-mask-intrinsics.html
 *  - Clang >= v8.0
 *  - GCC >= v7.1
 */
int main(void)
{
    __mmask8 m8 = _mm512_cmpeq_epi64_mask(_mm512_set1_epi64(1), _mm512_set1_epi64(1));
    m8 = _kor_mask8(m8, m8);
    m8 = _kxor_mask8(m8, m8);
    m8 = _cvtu32_mask8(_cvtmask8_u32(m8));
    return (int)_cvtmask8_u32(m8);
}
@@ -0,0 +1,41 @@
#include <immintrin.h>
/**
 * The following intrinsics don't have direct native support, but compilers
 * tend to emulate them.
 * They're usually supported by gcc >= 7.1, clang >= 4 and icc >= 19
 */
int main(void)
{
    __m512  one_ps = _mm512_set1_ps(1.0f);
    __m512d one_pd = _mm512_set1_pd(1.0);
    __m512i one_i64 = _mm512_set1_epi64(1);
    // add
    float sum_ps  = _mm512_reduce_add_ps(one_ps);
    double sum_pd = _mm512_reduce_add_pd(one_pd);
    int sum_int   = (int)_mm512_reduce_add_epi64(one_i64);
        sum_int  += (int)_mm512_reduce_add_epi32(one_i64);
    // mul
    sum_ps  += _mm512_reduce_mul_ps(one_ps);
    sum_pd  += _mm512_reduce_mul_pd(one_pd);
    sum_int += (int)_mm512_reduce_mul_epi64(one_i64);
    sum_int += (int)_mm512_reduce_mul_epi32(one_i64);
    // min
    sum_ps  += _mm512_reduce_min_ps(one_ps);
    sum_pd  += _mm512_reduce_min_pd(one_pd);
    sum_int += (int)_mm512_reduce_min_epi32(one_i64);
    sum_int += (int)_mm512_reduce_min_epu32(one_i64);
    sum_int += (int)_mm512_reduce_min_epi64(one_i64);
    // max
    sum_ps  += _mm512_reduce_max_ps(one_ps);
    sum_pd  += _mm512_reduce_max_pd(one_pd);
    sum_int += (int)_mm512_reduce_max_epi32(one_i64);
    sum_int += (int)_mm512_reduce_max_epu32(one_i64);
    sum_int += (int)_mm512_reduce_max_epi64(one_i64);
    // and
    sum_int += (int)_mm512_reduce_and_epi32(one_i64);
    sum_int += (int)_mm512_reduce_and_epi64(one_i64);
    // or
    sum_int += (int)_mm512_reduce_or_epi32(one_i64);
    sum_int += (int)_mm512_reduce_or_epi64(one_i64);
    return (int)sum_ps + (int)sum_pd + sum_int;
}
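For context, the emulation the header comment alludes to is typically a cascade of narrowing adds; a sketch of how `_mm512_reduce_add_ps` is commonly lowered (an assumption about typical codegen, not taken from any particular compiler):

#include <immintrin.h>

/* Reduce 512 -> 256 -> 128 bits, then finish with in-register shuffles.
 * Uses only AVX512F plus AVX/SSE, matching what this check requires. */
static float reduce_add_ps_sketch(__m512 v)
{
    __m256 lo = _mm512_castps512_ps256(v);
    __m256 hi = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1));
    __m256 s8 = _mm256_add_ps(lo, hi);
    __m128 s4 = _mm_add_ps(_mm256_castps256_ps128(s8), _mm256_extractf128_ps(s8, 1));
    s4 = _mm_add_ps(s4, _mm_movehl_ps(s4, s4));     /* lane0 = a+c, lane1 = b+d */
    s4 = _mm_add_ss(s4, _mm_shuffle_ps(s4, s4, 1)); /* lane0 = a+c+b+d          */
    return _mm_cvtss_f32(s4);
}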
@@ -0,0 +1,12 @@
/**
 * Assembler may not fully support the following VSX3 scalar
 * instructions, even though compilers report VSX3 support.
 */
int main(void)
{
    unsigned short bits = 0xFF;
    double f;
    __asm__ __volatile__("xscvhpdp %x0,%x1" : "=wa"(f) : "wa"(bits));
    __asm__ __volatile__ ("xscvdphp %x0,%x1" : "=wa" (bits) : "wa" (f));
    return bits;
}
@@ -0,0 +1,21 @@
#ifndef __VSX__
    #error "VSX is not supported"
#endif
#include <altivec.h>

typedef __vector float fv4sf_t;
typedef __vector unsigned char vec_t;

int main(void)
{
    __vector_quad acc0;
    float a[4] = {0,1,2,3};
    float b[4] = {0,1,2,3};
    vec_t *va = (vec_t *) a;
    vec_t *vb = (vec_t *) b;
    __builtin_mma_xvf32ger(&acc0, va[0], vb[0]);
    fv4sf_t result[4];
    __builtin_mma_disassemble_acc((void *)result, &acc0);
    fv4sf_t c0 = result[0];
    return (int)((float*)&c0)[0];
}
@@ -0,0 +1,36 @@
/**
 * Testing ASM VSX register number fixer '%x<n>'
 *
 * Old versions of CLANG don't support %x<n> in the inline asm template,
 * which fixes the register number when using any of the register constraints wa, wd, wf.
 *
 * xref:
 * - https://bugs.llvm.org/show_bug.cgi?id=31837
 * - https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html
 */
#ifndef __VSX__
    #error "VSX is not supported"
#endif
#include <altivec.h>

#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
    #define vsx_ld  vec_vsx_ld
    #define vsx_st  vec_vsx_st
#else
    #define vsx_ld  vec_xl
    #define vsx_st  vec_xst
#endif

int main(void)
{
    float z4[] = {0, 0, 0, 0};
    signed int zout[] = {0, 0, 0, 0};

    __vector float vz4 = vsx_ld(0, z4);
    __vector signed int asm_ret = vsx_ld(0, zout);

    __asm__ ("xvcvspsxws %x0,%x1" : "=wa" (asm_ret) : "wa" (vz4));

    vsx_st(asm_ret, 0, zout);
    return zout[0];
}
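To illustrate what the modifier changes (inferred from the GCC machine-constraint docs linked in the header, so treat the exact spellings as an assumption): with the `wa` constraint an operand may land in the upper half of the VSX file, e.g. Altivec register v3, which is VSX register vs35; plain `%0` substitutes the Altivec number, while `%x0` substitutes the unified VSX number that VSX mnemonics expect.

/* Hypothetical expansions of the template above for operands in v3/vs35: */
/*   xvcvspsxws 3,3     <- "%0,%1"   (wrong: Altivec numbering) */
/*   xvcvspsxws 35,35   <- "%x0,%x1" (right: VSX numbering)     */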
@@ -0,0 +1 @@
int test_flags;