@@ -0,0 +1,27 @@
#ifdef _MSC_VER
    #include <Intrin.h>
#endif
#include <arm_neon.h>

int main(int argc, char **argv)
{
    float *src = (float*)argv[argc-1];
    float32x4_t v1 = vdupq_n_f32(src[0]), v2 = vdupq_n_f32(src[1]);
    /* MAXMIN */
    int ret  = (int)vgetq_lane_f32(vmaxnmq_f32(v1, v2), 0);
        ret += (int)vgetq_lane_f32(vminnmq_f32(v1, v2), 0);
    /* ROUNDING */
    ret += (int)vgetq_lane_f32(vrndq_f32(v1), 0);
#ifdef __aarch64__
    {
        double *src2 = (double*)argv[argc-1];
        float64x2_t vd1 = vdupq_n_f64(src2[0]), vd2 = vdupq_n_f64(src2[1]);
        /* MAXMIN */
        ret += (int)vgetq_lane_f64(vmaxnmq_f64(vd1, vd2), 0);
        ret += (int)vgetq_lane_f64(vminnmq_f64(vd1, vd2), 0);
        /* ROUNDING */
        ret += (int)vgetq_lane_f64(vrndq_f64(vd1), 0);
    }
#endif
    return ret;
}
@@ -0,0 +1,16 @@
#ifdef _MSC_VER
    #include <Intrin.h>
#endif
#include <arm_neon.h>

int main(int argc, char **argv)
{
    unsigned char *src = (unsigned char*)argv[argc-1];
    uint8x16_t v1 = vdupq_n_u8(src[0]), v2 = vdupq_n_u8(src[1]);
    uint32x4_t va = vdupq_n_u32(3);
    int ret = (int)vgetq_lane_u32(vdotq_u32(va, v1, v2), 0);
#ifdef __aarch64__
    ret += (int)vgetq_lane_u32(vdotq_laneq_u32(va, v1, v2, 0), 0);
#endif
    return ret;
}
@@ -0,0 +1,19 @@
#ifdef _MSC_VER
    #include <Intrin.h>
#endif
#include <arm_neon.h>

int main(int argc, char **argv)
{
    float16_t *src = (float16_t*)argv[argc-1];
    float *src2 = (float*)argv[argc-2];
    float16x8_t vhp  = vdupq_n_f16(src[0]);
    float16x4_t vlhp = vdup_n_f16(src[1]);
    float32x4_t vf   = vdupq_n_f32(src2[0]);
    float32x2_t vlf  = vdup_n_f32(src2[1]);

    int ret  = (int)vget_lane_f32(vfmlal_low_f16(vlf, vlhp, vlhp), 0);
        ret += (int)vgetq_lane_f32(vfmlslq_high_f16(vf, vhp, vhp), 0);

    return ret;
}
@@ -0,0 +1,15 @@
#ifdef _MSC_VER
    #include <Intrin.h>
#endif
#include <arm_neon.h>

int main(int argc, char **argv)
{
    float16_t *src = (float16_t*)argv[argc-1];
    float16x8_t vhp  = vdupq_n_f16(src[0]);
    float16x4_t vlhp = vdup_n_f16(src[1]);

    int ret  =  (int)vgetq_lane_f16(vabdq_f16(vhp, vhp), 0);
        ret  += (int)vget_lane_f16(vabd_f16(vlhp, vlhp), 0);
    return ret;
}
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __AVX__
        #error "HOST/ARCH doesn't support AVX"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    __m256 a = _mm256_add_ps(_mm256_loadu_ps((const float*)argv[argc-1]), _mm256_loadu_ps((const float*)argv[1]));
    return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
}
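The guard above is what makes the probe meaningful under ICC. A minimal sketch of the failure mode it prevents (illustrative code, not part of the commit):

#include <immintrin.h>

/* Under ICC without any AVX build flags this still compiles and links,
 * so compilation success alone would wrongly report AVX support; only
 * the #ifndef __AVX__ check above catches the mismatch. */
int naive_avx_probe(void)
{
    __m256 a = _mm256_setzero_ps();
    return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
}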
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __AVX2__
        #error "HOST/ARCH doesn't support AVX2"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    __m256i a = _mm256_abs_epi16(_mm256_loadu_si256((const __m256i*)argv[argc-1]));
    return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
}
@@ -0,0 +1,22 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __AVX512VNNI__
        #error "HOST/ARCH doesn't support CascadeLake AVX512 features"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    /* VNNI */
    __m512i a = _mm512_loadu_si512((const __m512i*)argv[argc-1]);
            a = _mm512_dpbusd_epi32(a, _mm512_setzero_si512(), a);
    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
}
@@ -0,0 +1,24 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #if !defined(__AVX512VBMI__) || !defined(__AVX512IFMA__)
        #error "HOST/ARCH doesn't support CannonLake AVX512 features"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    __m512i a = _mm512_loadu_si512((const __m512i*)argv[argc-1]);
    /* IFMA */
    a = _mm512_madd52hi_epu64(a, a, _mm512_setzero_si512());
    /* VBMI */
    a = _mm512_permutex2var_epi8(a, _mm512_setzero_si512(), a);
    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
}
@@ -0,0 +1,26 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #if !defined(__AVX512VBMI2__) || !defined(__AVX512BITALG__) || !defined(__AVX512VPOPCNTDQ__)
        #error "HOST/ARCH doesn't support IceLake AVX512 features"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    __m512i a = _mm512_loadu_si512((const __m512i*)argv[argc-1]);
    /* VBMI2 */
    a = _mm512_shrdv_epi64(a, a, _mm512_setzero_si512());
    /* BITALG */
    a = _mm512_popcnt_epi8(a);
    /* VPOPCNTDQ */
    a = _mm512_popcnt_epi64(a);
    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
}
@@ -0,0 +1,25 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #if !defined(__AVX512ER__) || !defined(__AVX512PF__)
        #error "HOST/ARCH doesn't support Knights Landing AVX512 features"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    int base[128] = {0};
    __m512d ad = _mm512_loadu_pd((const __m512d*)argv[argc-1]);
    /* ER */
    __m512i a = _mm512_castpd_si512(_mm512_exp2a23_pd(ad));
    /* PF */
    _mm512_mask_prefetch_i64scatter_pd(base, _mm512_cmpeq_epi64_mask(a, a), a, 1, _MM_HINT_T1);
    return base[0];
}
@@ -0,0 +1,30 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #if !defined(__AVX5124FMAPS__) || !defined(__AVX5124VNNIW__) || !defined(__AVX512VPOPCNTDQ__)
        #error "HOST/ARCH doesn't support Knights Mill AVX512 features"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    __m512i a = _mm512_loadu_si512((const __m512i*)argv[argc-1]);
    __m512 b = _mm512_loadu_ps((const __m512*)argv[argc-2]);

    /* 4FMAPS */
    b = _mm512_4fmadd_ps(b, b, b, b, b, NULL);
    /* 4VNNIW */
    a = _mm512_4dpwssd_epi32(a, a, a, a, a, NULL);
    /* VPOPCNTDQ */
    a = _mm512_popcnt_epi64(a);

    a = _mm512_add_epi32(a, _mm512_castps_si512(b));
    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
}
@@ -0,0 +1,26 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #if !defined(__AVX512VL__) || !defined(__AVX512BW__) || !defined(__AVX512DQ__)
        #error "HOST/ARCH doesn't support SkyLake AVX512 features"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    __m512i aa = _mm512_abs_epi32(_mm512_loadu_si512((const __m512i*)argv[argc-1]));
    /* VL */
    __m256i a = _mm256_abs_epi64(_mm512_extracti64x4_epi64(aa, 1));
    /* DQ */
    __m512i b = _mm512_broadcast_i32x8(a);
    /* BW */
    b = _mm512_abs_epi16(b);
    return _mm_cvtsi128_si32(_mm512_castsi512_si128(b));
}
@@ -0,0 +1,26 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #if !defined(__AVX512FP16__)
        #error "HOST/ARCH doesn't support Sapphire Rapids AVX512FP16 features"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
/* clang has a bug regarding our spr code, see gh-23730. */
#if __clang__
#error
#endif
    __m512h a = _mm512_loadu_ph((void*)argv[argc-1]);
    __m512h temp = _mm512_fmadd_ph(a, a, a);
    _mm512_storeu_ph((void*)(argv[argc-1]), temp);
    return 0;
}
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __AVX512CD__
        #error "HOST/ARCH doesn't support AVX512CD"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    __m512i a = _mm512_lzcnt_epi32(_mm512_loadu_si512((const __m512i*)argv[argc-1]));
    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
}
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __AVX512F__
        #error "HOST/ARCH doesn't support AVX512F"
    #endif
#endif

#include <immintrin.h>

int main(int argc, char **argv)
{
    __m512i a = _mm512_abs_epi32(_mm512_loadu_si512((const __m512i*)argv[argc-1]));
    return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
}
@@ -0,0 +1,22 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __F16C__
        #error "HOST/ARCH doesn't support F16C"
    #endif
#endif

#include <emmintrin.h>
#include <immintrin.h>

int main(int argc, char **argv)
{
    __m128 a  = _mm_cvtph_ps(_mm_loadu_si128((const __m128i*)argv[argc-1]));
    __m256 a8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)argv[argc-2]));
    return (int)(_mm_cvtss_f32(a) + _mm_cvtss_f32(_mm256_castps256_ps128(a8)));
}
@@ -0,0 +1,22 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #if !defined(__FMA__) && !defined(__AVX2__)
        #error "HOST/ARCH doesn't support FMA3"
    #endif
#endif

#include <xmmintrin.h>
#include <immintrin.h>

int main(int argc, char **argv)
{
    __m256 a = _mm256_loadu_ps((const float*)argv[argc-1]);
           a = _mm256_fmadd_ps(a, a, a);
    return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
}
@@ -0,0 +1,13 @@
#include <immintrin.h>
#ifdef _MSC_VER
    #include <ammintrin.h>
#else
    #include <x86intrin.h>
#endif

int main(int argc, char **argv)
{
    __m256 a = _mm256_loadu_ps((const float*)argv[argc-1]);
           a = _mm256_macc_ps(a, a, a);
    return (int)_mm_cvtss_f32(_mm256_castps256_ps128(a));
}
@@ -0,0 +1,11 @@
#ifndef __loongarch_sx
#error "HOST/ARCH doesn't support LSX"
#endif

#include <lsxintrin.h>

int main(void)
{
    __m128i a = __lsx_vadd_d(__lsx_vldi(0), __lsx_vldi(0));
    return __lsx_vpickve2gr_w(a, 0);
}
@@ -0,0 +1,19 @@
#ifdef _MSC_VER
    #include <Intrin.h>
#endif
#include <arm_neon.h>

int main(int argc, char **argv)
{
    // Load inputs through untraced pointers so the compiler cannot fold
    // them into constants; this keeps the test meaningful for the linker.
    float *src = (float*)argv[argc-1];
    float32x4_t v1 = vdupq_n_f32(src[0]), v2 = vdupq_n_f32(src[1]);
    int ret = (int)vgetq_lane_f32(vmulq_f32(v1, v2), 0);
#ifdef __aarch64__
    double *src2 = (double*)argv[argc-2];
    float64x2_t vd1 = vdupq_n_f64(src2[0]), vd2 = vdupq_n_f64(src2[1]);
    ret += (int)vgetq_lane_f64(vmulq_f64(vd1, vd2), 0);
#endif
    return ret;
}
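The comment about untraced pointers encodes a real hazard: with literal constants the whole computation can be folded at compile time, leaving no NEON instructions for the assembler or linker to validate. A hedged counter-example of what the probe deliberately avoids (not part of the commit):

#include <arm_neon.h>

/* Anti-pattern: every input is a compile-time constant, so the multiply
 * can fold to 4.0f and the probe may "pass" without a single NEON
 * instruction reaching the object file. */
int folded_probe(void)
{
    float32x4_t v = vdupq_n_f32(2.0f);
    return (int)vgetq_lane_f32(vmulq_f32(v, v), 0);
}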
@@ -0,0 +1,11 @@
#ifdef _MSC_VER
    #include <Intrin.h>
#endif
#include <arm_neon.h>

int main(int argc, char **argv)
{
    short *src = (short*)argv[argc-1];
    float32x4_t v_z4 = vcvt_f32_f16((float16x4_t)vld1_s16(src));
    return (int)vgetq_lane_f32(v_z4, 0);
}
@@ -0,0 +1,21 @@
#ifdef _MSC_VER
    #include <Intrin.h>
#endif
#include <arm_neon.h>

int main(int argc, char **argv)
{
    float *src = (float*)argv[argc-1];
    float32x4_t v1 = vdupq_n_f32(src[0]);
    float32x4_t v2 = vdupq_n_f32(src[1]);
    float32x4_t v3 = vdupq_n_f32(src[2]);
    int ret = (int)vgetq_lane_f32(vfmaq_f32(v1, v2, v3), 0);
#ifdef __aarch64__
    double *src2 = (double*)argv[argc-2];
    float64x2_t vd1 = vdupq_n_f64(src2[0]);
    float64x2_t vd2 = vdupq_n_f64(src2[1]);
    float64x2_t vd3 = vdupq_n_f64(src2[2]);
    ret += (int)vgetq_lane_f64(vfmaq_f64(vd1, vd2, vd3), 0);
#endif
    return ret;
}
@@ -0,0 +1,32 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #if !defined(__SSE4_2__) && !defined(__POPCNT__)
        #error "HOST/ARCH doesn't support POPCNT"
    #endif
#endif

#ifdef _MSC_VER
    #include <nmmintrin.h>
#else
    #include <popcntintrin.h>
#endif

int main(int argc, char **argv)
{
    // Load the operands from memory to make sure popcnt instructions
    // are generated and tested against the assembler.
    unsigned long long a = *((unsigned long long*)argv[argc-1]);
    unsigned int b = *((unsigned int*)argv[argc-2]);

#if defined(_M_X64) || defined(__x86_64__)
    a = _mm_popcnt_u64(a);
#endif
    b = _mm_popcnt_u32(b);
    return (int)a + b;
}
@@ -0,0 +1,13 @@
#ifndef __riscv_vector
  #error RVV not supported
#endif

#include <riscv_vector.h>

int main(void)
{
    size_t vlmax = __riscv_vsetvlmax_e32m1();
    vuint32m1_t a = __riscv_vmv_v_x_u32m1(0, vlmax);
    vuint32m1_t b = __riscv_vadd_vv_u32m1(a, a, vlmax);
    return __riscv_vmv_x_s_u32m1_u32(b);
}
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __SSE__
        #error "HOST/ARCH doesn't support SSE"
    #endif
#endif

#include <xmmintrin.h>

int main(void)
{
    __m128 a = _mm_add_ps(_mm_setzero_ps(), _mm_setzero_ps());
    return (int)_mm_cvtss_f32(a);
}
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __SSE2__
        #error "HOST/ARCH doesn't support SSE2"
    #endif
#endif

#include <emmintrin.h>

int main(void)
{
    __m128i a = _mm_add_epi16(_mm_setzero_si128(), _mm_setzero_si128());
    return _mm_cvtsi128_si32(a);
}
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __SSE3__
        #error "HOST/ARCH doesn't support SSE3"
    #endif
#endif

#include <pmmintrin.h>

int main(void)
{
    __m128 a = _mm_hadd_ps(_mm_setzero_ps(), _mm_setzero_ps());
    return (int)_mm_cvtss_f32(a);
}
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __SSE4_1__
        #error "HOST/ARCH doesn't support SSE41"
    #endif
#endif

#include <smmintrin.h>

int main(void)
{
    __m128 a = _mm_floor_ps(_mm_setzero_ps());
    return (int)_mm_cvtss_f32(a);
}
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __SSE4_2__
        #error "HOST/ARCH doesn't support SSE42"
    #endif
#endif

#include <smmintrin.h>

int main(void)
{
    __m128 a = _mm_hadd_ps(_mm_setzero_ps(), _mm_setzero_ps());
    return (int)_mm_cvtss_f32(a);
}
@@ -0,0 +1,20 @@
#if defined(DETECT_FEATURES) && defined(__INTEL_COMPILER)
    /*
     * Unlike GCC and CLANG, Intel Compiler exposes all supported intrinsics,
     * whether or not the build options for those features are specified.
     * Therefore, when option native/host is enabled via `--cpu-baseline` or
     * through the env var `CFLAGS`, we must test the #definitions of CPU
     * features; otherwise the test breaks and enables all possible features.
     */
    #ifndef __SSSE3__
        #error "HOST/ARCH doesn't support SSSE3"
    #endif
#endif

#include <tmmintrin.h>

int main(void)
{
    __m128i a = _mm_hadd_epi16(_mm_setzero_si128(), _mm_setzero_si128());
    return (int)_mm_cvtsi128_si32(a);
}
@@ -0,0 +1,13 @@
#include <arm_sve.h>

int accumulate(svint64_t a, svint64_t b) {
    svbool_t p = svptrue_b64();
    return svaddv(p, svmla_z(p, a, a, b));
}

int main(void)
{
    svint64_t a = svdup_s64(1);
    svint64_t b = svdup_s64(2);
    return accumulate(a, b);
}
@@ -0,0 +1,21 @@
#ifndef __VSX__
    #error "VSX is not supported"
#endif
#include <altivec.h>

#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
    #define vsx_ld  vec_vsx_ld
    #define vsx_st  vec_vsx_st
#else
    #define vsx_ld  vec_xl
    #define vsx_st  vec_xst
#endif

int main(void)
{
    unsigned int zout[4];
    unsigned int z4[] = {0, 0, 0, 0};
    __vector unsigned int v_z4 = vsx_ld(0, z4);
    vsx_st(v_z4, 0, zout);
    return zout[0];
}
@@ -0,0 +1,13 @@
#ifndef __VSX__
    #error "VSX is not supported"
#endif
#include <altivec.h>

typedef __vector unsigned long long v_uint64x2;

int main(void)
{
    v_uint64x2 z2 = (v_uint64x2){0, 0};
    z2 = (v_uint64x2)vec_cmpeq(z2, z2);
    return (int)vec_extract(z2, 0);
}
@@ -0,0 +1,13 @@
#ifndef __VSX__
    #error "VSX is not supported"
#endif
#include <altivec.h>

typedef __vector unsigned int v_uint32x4;

int main(void)
{
    v_uint32x4 z4 = (v_uint32x4){0, 0, 0, 0};
    z4 = vec_absd(z4, z4);
    return (int)vec_extract(z4, 0);
}
@@ -0,0 +1,14 @@
#ifndef __VSX__
    #error "VSX is not supported"
#endif
#include <altivec.h>

typedef __vector unsigned int v_uint32x4;

int main(void)
{
    v_uint32x4 v1 = (v_uint32x4){2, 4, 8, 16};
    v_uint32x4 v2 = (v_uint32x4){2, 2, 2, 2};
    v_uint32x4 v3 = vec_mod(v1, v2);
    return (int)vec_extractm(v3);
}
lib/python3.11/site-packages/numpy/distutils/checks/cpu_vx.c (new file, 16 lines)
@@ -0,0 +1,16 @@
#if (__VEC__ < 10301) || (__ARCH__ < 11)
    #error VX not supported
#endif

#include <vecintrin.h>
int main(int argc, char **argv)
{
    __vector double x = vec_abs(vec_xl(argc, (double*)argv));
    __vector double y = vec_load_len((double*)argv, (unsigned int)argc);

    x = vec_round(vec_ceil(x) + vec_floor(y));
    __vector bool long long m = vec_cmpge(x, y);
    __vector long long i = vec_signed(vec_sel(x, y, m));

    return (int)vec_extract(i, 0);
}
@@ -0,0 +1,25 @@
#if (__VEC__ < 10302) || (__ARCH__ < 12)
    #error VXE not supported
#endif

#include <vecintrin.h>
int main(int argc, char **argv)
{
    __vector float x = vec_nabs(vec_xl(argc, (float*)argv));
    __vector float y = vec_load_len((float*)argv, (unsigned int)argc);

    x = vec_round(vec_ceil(x) + vec_floor(y));
    __vector bool int m = vec_cmpge(x, y);
    x = vec_sel(x, y, m);

    // need to test for the existence of the intrinsic "vflls", since
    // vec_doublee maps to the wrong intrinsic "vfll" on GCC.
    // see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100871
#if defined(__GNUC__) && !defined(__clang__)
    __vector long long i = vec_signed(__builtin_s390_vflls(x));
#else
    __vector long long i = vec_signed(vec_doublee(x));
#endif

    return (int)vec_extract(i, 0);
}
@@ -0,0 +1,21 @@
#if (__VEC__ < 10303) || (__ARCH__ < 13)
    #error VXE2 not supported
#endif

#include <vecintrin.h>

int main(int argc, char **argv)
{
    int val;
    __vector signed short large = { 'a', 'b', 'c', 'a', 'g', 'h', 'g', 'o' };
    __vector signed short search = { 'g', 'h', 'g', 'o' };
    __vector unsigned char len = { 0 };
    __vector unsigned char res = vec_search_string_cc(large, search, len, &val);
    __vector float x = vec_xl(argc, (float*)argv);
    __vector int i = vec_signed(x);

    i = vec_srdb(vec_sldb(i, i, 2), i, 3);
    val += (int)vec_extract(res, 1);
    val += vec_extract(i, 0);
    return val;
}
@@ -0,0 +1,12 @@
#include <immintrin.h>
#ifdef _MSC_VER
    #include <ammintrin.h>
#else
    #include <x86intrin.h>
#endif

int main(void)
{
    __m128i a = _mm_comge_epu32(_mm_setzero_si128(), _mm_setzero_si128());
    return _mm_cvtsi128_si32(a);
}
@@ -0,0 +1,18 @@
#include <immintrin.h>
/**
 * Test BW mask operations, which require:
 *  - MSVC since VS2019, see
 *    https://developercommunity.visualstudio.com/content/problem/518298/missing-avx512bw-mask-intrinsics.html
 *  - Clang >= v8.0
 *  - GCC >= v7.1
 */
int main(void)
{
    __mmask64 m64 = _mm512_cmpeq_epi8_mask(_mm512_set1_epi8((char)1), _mm512_set1_epi8((char)1));
    m64 = _kor_mask64(m64, m64);
    m64 = _kxor_mask64(m64, m64);
    m64 = _cvtu64_mask64(_cvtmask64_u64(m64));
    m64 = _mm512_kunpackd(m64, m64);
    m64 = (__mmask64)_mm512_kunpackw((__mmask32)m64, (__mmask32)m64);
    return (int)_cvtmask64_u64(m64);
}
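Since `__mmask64` is an ordinary 64-bit integer type, the same dataflow can be approximated with plain integer operators on toolchains older than those listed above; a rough polyfill sketch (an assumption about how one might work around missing intrinsics, not code from this commit — the function name is hypothetical):

#include <immintrin.h>

/* Hypothetical fallback: _kor_mask64/_kxor_mask64 become | and ^, and
 * the mask<->integer conversion intrinsics reduce to plain casts. */
unsigned long long mask64_fallback(__mmask64 a, __mmask64 b)
{
    unsigned long long x = (unsigned long long)a | (unsigned long long)b; /* _kor_mask64  */
    x ^= (unsigned long long)b;                                           /* _kxor_mask64 */
    return x;
}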
@@ -0,0 +1,16 @@
#include <immintrin.h>
/**
 * Test DQ mask operations, which require:
 *  - MSVC since VS2019, see
 *    https://developercommunity.visualstudio.com/content/problem/518298/missing-avx512bw-mask-intrinsics.html
 *  - Clang >= v8.0
 *  - GCC >= v7.1
 */
int main(void)
{
    __mmask8 m8 = _mm512_cmpeq_epi64_mask(_mm512_set1_epi64(1), _mm512_set1_epi64(1));
    m8 = _kor_mask8(m8, m8);
    m8 = _kxor_mask8(m8, m8);
    m8 = _cvtu32_mask8(_cvtmask8_u32(m8));
    return (int)_cvtmask8_u32(m8);
}
@@ -0,0 +1,41 @@
#include <immintrin.h>
/**
 * The following intrinsics don't have direct native support, but compilers
 * tend to emulate them.
 * They're usually supported by gcc >= 7.1, clang >= 4 and icc >= 19
 */
int main(void)
{
    __m512  one_ps = _mm512_set1_ps(1.0f);
    __m512d one_pd = _mm512_set1_pd(1.0);
    __m512i one_i64 = _mm512_set1_epi64(1);
    // add
    float sum_ps  = _mm512_reduce_add_ps(one_ps);
    double sum_pd = _mm512_reduce_add_pd(one_pd);
    int sum_int   = (int)_mm512_reduce_add_epi64(one_i64);
        sum_int  += (int)_mm512_reduce_add_epi32(one_i64);
    // mul
    sum_ps  += _mm512_reduce_mul_ps(one_ps);
    sum_pd  += _mm512_reduce_mul_pd(one_pd);
    sum_int += (int)_mm512_reduce_mul_epi64(one_i64);
    sum_int += (int)_mm512_reduce_mul_epi32(one_i64);
    // min
    sum_ps  += _mm512_reduce_min_ps(one_ps);
    sum_pd  += _mm512_reduce_min_pd(one_pd);
    sum_int += (int)_mm512_reduce_min_epi32(one_i64);
    sum_int += (int)_mm512_reduce_min_epu32(one_i64);
    sum_int += (int)_mm512_reduce_min_epi64(one_i64);
    // max
    sum_ps  += _mm512_reduce_max_ps(one_ps);
    sum_pd  += _mm512_reduce_max_pd(one_pd);
    sum_int += (int)_mm512_reduce_max_epi32(one_i64);
    sum_int += (int)_mm512_reduce_max_epu32(one_i64);
    sum_int += (int)_mm512_reduce_max_epi64(one_i64);
    // and
    sum_int += (int)_mm512_reduce_and_epi32(one_i64);
    sum_int += (int)_mm512_reduce_and_epi64(one_i64);
    // or
    sum_int += (int)_mm512_reduce_or_epi32(one_i64);
    sum_int += (int)_mm512_reduce_or_epi64(one_i64);
    return (int)sum_ps + (int)sum_pd + sum_int;
}
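For context, the emulation the header comment alludes to is typically a cascade of narrowing adds; a sketch of how `_mm512_reduce_add_ps` is commonly lowered (an assumption about typical codegen, not taken from any particular compiler):

#include <immintrin.h>

/* Reduce 512 -> 256 -> 128 bits, then finish with in-register shuffles.
 * Uses only AVX512F plus AVX/SSE, matching what this check requires. */
static float reduce_add_ps_sketch(__m512 v)
{
    __m256 lo = _mm512_castps512_ps256(v);
    __m256 hi = _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(v), 1));
    __m256 s8 = _mm256_add_ps(lo, hi);
    __m128 s4 = _mm_add_ps(_mm256_castps256_ps128(s8), _mm256_extractf128_ps(s8, 1));
    s4 = _mm_add_ps(s4, _mm_movehl_ps(s4, s4));     /* lane0 = a+c, lane1 = b+d */
    s4 = _mm_add_ss(s4, _mm_shuffle_ps(s4, s4, 1)); /* lane0 = a+c+b+d          */
    return _mm_cvtss_f32(s4);
}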
@@ -0,0 +1,12 @@
/**
 * Assembler may not fully support the following VSX3 scalar
 * instructions, even though compilers report VSX3 support.
 */
int main(void)
{
    unsigned short bits = 0xFF;
    double f;
    __asm__ __volatile__("xscvhpdp %x0,%x1" : "=wa"(f) : "wa"(bits));
    __asm__ __volatile__ ("xscvdphp %x0,%x1" : "=wa" (bits) : "wa" (f));
    return bits;
}
@@ -0,0 +1,21 @@
#ifndef __VSX__
    #error "VSX is not supported"
#endif
#include <altivec.h>

typedef __vector float fv4sf_t;
typedef __vector unsigned char vec_t;

int main(void)
{
    __vector_quad acc0;
    float a[4] = {0,1,2,3};
    float b[4] = {0,1,2,3};
    vec_t *va = (vec_t *) a;
    vec_t *vb = (vec_t *) b;
    __builtin_mma_xvf32ger(&acc0, va[0], vb[0]);
    fv4sf_t result[4];
    __builtin_mma_disassemble_acc((void *)result, &acc0);
    fv4sf_t c0 = result[0];
    return (int)((float*)&c0)[0];
}
@@ -0,0 +1,36 @@
/**
 * Testing ASM VSX register number fixer '%x<n>'
 *
 * Old versions of CLANG don't support %x<n> in the inline asm template,
 * which fixes the register number when using any of the register constraints wa, wd, wf.
 *
 * xref:
 * - https://bugs.llvm.org/show_bug.cgi?id=31837
 * - https://gcc.gnu.org/onlinedocs/gcc/Machine-Constraints.html
 */
#ifndef __VSX__
    #error "VSX is not supported"
#endif
#include <altivec.h>

#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
    #define vsx_ld  vec_vsx_ld
    #define vsx_st  vec_vsx_st
#else
    #define vsx_ld  vec_xl
    #define vsx_st  vec_xst
#endif

int main(void)
{
    float z4[] = {0, 0, 0, 0};
    signed int zout[] = {0, 0, 0, 0};

    __vector float vz4 = vsx_ld(0, z4);
    __vector signed int asm_ret = vsx_ld(0, zout);

    __asm__ ("xvcvspsxws %x0,%x1" : "=wa" (asm_ret) : "wa" (vz4));

    vsx_st(asm_ret, 0, zout);
    return zout[0];
}
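To illustrate what the modifier changes (inferred from the GCC machine-constraint docs linked in the header, so treat the exact spellings as an assumption): with the `wa` constraint an operand may land in the upper half of the VSX file, e.g. Altivec register v3, which is VSX register vs35; plain `%0` substitutes the Altivec number, while `%x0` substitutes the unified VSX number that VSX mnemonics expect.

/* Hypothetical expansions of the template above for operands in v3/vs35: */
/*   xvcvspsxws 3,3     <- "%0,%1"   (wrong: Altivec numbering) */
/*   xvcvspsxws 35,35   <- "%x0,%x1" (right: VSX numbering)     */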
@@ -0,0 +1 @@
int test_flags;