--- msolve-0.5.0/m4/ax_ext.m4.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/m4/ax_ext.m4 2023-07-28 10:50:28.215653884 -0600 @@ -55,274 +55,12 @@ AC_DEFUN([AX_EXT], SIMD_FLAGS="" case $host_cpu in - powerpc*) - AC_CACHE_CHECK([whether altivec is supported for old distros], [ax_cv_have_altivec_old_ext], - [ - if test `/usr/sbin/sysctl -a 2>/dev/null| grep -c hw.optional.altivec` != 0; then - if test `/usr/sbin/sysctl -n hw.optional.altivec` = 1; then - ax_cv_have_altivec_old_ext=yes - fi - fi - ]) - - if test "$ax_cv_have_altivec_old_ext" = yes; then - AC_DEFINE(HAVE_ALTIVEC,,[Support Altivec instructions]) - AX_CHECK_COMPILE_FLAG(-faltivec, SIMD_FLAGS="$SIMD_FLAGS -faltivec", []) - fi - - AC_CACHE_CHECK([whether altivec is supported], [ax_cv_have_altivec_ext], - [ - if test `LD_SHOW_AUXV=1 /bin/true 2>/dev/null|grep -c altivec` != 0; then - ax_cv_have_altivec_ext=yes - fi - ]) - - if test "$ax_cv_have_altivec_ext" = yes; then - AC_DEFINE(HAVE_ALTIVEC,,[Support Altivec instructions]) - AX_CHECK_COMPILE_FLAG(-maltivec, SIMD_FLAGS="$SIMD_FLAGS -maltivec", []) - fi - - AC_CACHE_CHECK([whether vsx is supported], [ax_cv_have_vsx_ext], - [ - if test `LD_SHOW_AUXV=1 /bin/true 2>/dev/null|grep -c vsx` != 0; then - ax_cv_have_vsx_ext=yes - fi - ]) - - if test "$ax_cv_have_vsx_ext" = yes; then - AC_DEFINE(HAVE_VSX,,[Support VSX instructions]) - AX_CHECK_COMPILE_FLAG(-mvsx, SIMD_FLAGS="$SIMD_FLAGS -mvsx", []) - fi - ;; - i[[3456]]86*|x86_64*|amd64*) - AC_REQUIRE([AX_GCC_X86_CPUID]) - AC_REQUIRE([AX_GCC_X86_CPUID_COUNT]) - AC_REQUIRE([AX_GCC_X86_AVX_XGETBV]) - - eax_cpuid0=0 - AX_GCC_X86_CPUID(0x00000000) - if test "$ax_cv_gcc_x86_cpuid_0x00000000" != "unknown"; - then - eax_cpuid0=`echo $ax_cv_gcc_x86_cpuid_0x00000000 | cut -d ":" -f 1` - fi - - eax_cpuid80000000=0 - AX_GCC_X86_CPUID(0x80000000) - if test "$ax_cv_gcc_x86_cpuid_0x80000000" != "unknown"; - then - eax_cpuid80000000=`echo $ax_cv_gcc_x86_cpuid_0x80000000 | cut -d ":" -f 1` - fi - - ecx_cpuid1=0 - edx_cpuid1=0 - if test "$((0x$eax_cpuid0))" -ge 1 ; then - AX_GCC_X86_CPUID(0x00000001) - if test "$ax_cv_gcc_x86_cpuid_0x00000001" != "unknown"; - then - ecx_cpuid1=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 3` - edx_cpuid1=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 4` - fi - fi - - ebx_cpuid7=0 - ecx_cpuid7=0 - if test "$((0x$eax_cpuid0))" -ge 7 ; then - AX_GCC_X86_CPUID_COUNT(0x00000007, 0x00) - if test "$ax_cv_gcc_x86_cpuid_0x00000007" != "unknown"; - then - ebx_cpuid7=`echo $ax_cv_gcc_x86_cpuid_0x00000007 | cut -d ":" -f 2` - ecx_cpuid7=`echo $ax_cv_gcc_x86_cpuid_0x00000007 | cut -d ":" -f 3` - fi - fi - - ecx_cpuid80000001=0 - edx_cpuid80000001=0 - if test "$((0x$eax_cpuid80000000))" -ge "$((0x80000001))" ; then - AX_GCC_X86_CPUID(0x80000001) - if test "$ax_cv_gcc_x86_cpuid_0x80000001" != "unknown"; - then - ecx_cpuid80000001=`echo $ax_cv_gcc_x86_cpuid_0x80000001 | cut -d ":" -f 3` - edx_cpuid80000001=`echo $ax_cv_gcc_x86_cpuid_0x80000001 | cut -d ":" -f 4` - fi - fi - - AC_CACHE_VAL([ax_cv_have_mmx_os_support_ext], - [ - ax_cv_have_mmx_os_support_ext=yes - ]) - - ax_cv_have_none_os_support_ext=yes - - AC_CACHE_VAL([ax_cv_have_sse_os_support_ext], - [ - ax_cv_have_sse_os_support_ext=no, - if test "$((0x$edx_cpuid1>>25&0x01))" = 1; then - AC_LANG_PUSH([C]) - AC_RUN_IFELSE([AC_LANG_SOURCE([[ -#include -#include - /* No way at ring1 to ring3 in protected mode to check the CR0 and CR4 - control registers directly. Execute an SSE instruction. 
- If it raises SIGILL then OS doesn't support SSE based instructions */ - void sig_handler(int signum){ exit(1); } - int main(){ - signal(SIGILL, sig_handler); - /* SSE instruction xorps %xmm0,%xmm0 */ - __asm__ __volatile__ (".byte 0x0f, 0x57, 0xc0"); - return 0; - }]])], - [ax_cv_have_sse_os_support_ext=yes], - [ax_cv_have_sse_os_support_ext=no], - [ax_cv_have_sse_os_support_ext=no]) - AC_LANG_POP([C]) - fi - ]) - - xgetbv_eax=0 - if test "$((0x$ecx_cpuid1>>28&0x01))" = 1; then - AX_GCC_X86_AVX_XGETBV(0x00000000) - - if test x"$ax_cv_gcc_x86_avx_xgetbv_0x00000000" != x"unknown"; then - xgetbv_eax=`echo $ax_cv_gcc_x86_avx_xgetbv_0x00000000 | cut -d ":" -f 1` - fi - - AC_CACHE_VAL([ax_cv_have_avx_os_support_ext], - [ - ax_cv_have_avx_os_support_ext=no - if test "$((0x$ecx_cpuid1>>27&0x01))" = 1; then - if test "$((0x$xgetbv_eax&0x6))" = 6; then - ax_cv_have_avx_os_support_ext=yes - fi - fi - ]) - fi - - AC_CACHE_VAL([ax_cv_have_avx512_os_support_ext], - [ - ax_cv_have_avx512_os_support_ext=no - if test "$ax_cv_have_avx_os_support_ext" = yes; then - if test "$((0x$xgetbv_eax&0xe6))" = "$((0xe6))"; then - ax_cv_have_avx512_os_support_ext=yes - fi - fi - ]) - - for ac_instr_info dnl - in "none;rdrnd;RDRND;ecx_cpuid1,30;-mrdrnd;HAVE_RDRND;CPUEXT_FLAGS" dnl - "none;bmi1;BMI1;ebx_cpuid7,3;-mbmi;HAVE_BMI1;CPUEXT_FLAGS" dnl - "none;bmi2;BMI2;ebx_cpuid7,8;-mbmi2;HAVE_BMI2;CPUEXT_FLAGS" dnl - "none;adx;ADX;ebx_cpuid7,19;-madx;HAVE_ADX;CPUEXT_FLAGS" dnl - dnl "none;mpx;MPX;ebx_cpuid7,14;-mmpx;HAVE_MPX;CPUEXT_FLAGS" dnl - "none;prefetchwt1;PREFETCHWT1;ecx_cpuid7,0;-mprefetchwt1;HAVE_PREFETCHWT1;CPUEXT_FLAGS" dnl - "none;abm;ABM;ecx_cpuid80000001,5;-mabm;HAVE_ABM;CPUEXT_FLAGS" dnl - "mmx;mmx;MMX;edx_cpuid1,23;-mmmx;HAVE_MMX;SIMD_FLAGS" dnl - "sse;sse;SSE;edx_cpuid1,25;-msse;HAVE_SSE;SIMD_FLAGS" dnl - "sse;sse2;SSE2;edx_cpuid1,26;-msse2;HAVE_SSE2;SIMD_FLAGS" dnl - "sse;sse3;SSE3;ecx_cpuid1,1;-msse3;HAVE_SSE3;SIMD_FLAGS" dnl - "sse;ssse3;SSSE3;ecx_cpuid1,9;-mssse3;HAVE_SSSE3;SIMD_FLAGS" dnl - "sse;sse41;SSE4.1;ecx_cpuid1,19;-msse4.1;HAVE_SSE4_1;SIMD_FLAGS" dnl - "sse;sse42;SSE4.2;ecx_cpuid1,20;-msse4.2;HAVE_SSE4_2;SIMD_FLAGS" dnl - "sse;sse4a;SSE4a;ecx_cpuid80000001,6;-msse4a;HAVE_SSE4a;SIMD_FLAGS" dnl - "sse;sha;SHA;ebx_cpuid7,29;-msha;HAVE_SHA;SIMD_FLAGS" dnl - "sse;aes;AES;ecx_cpuid1,25;-maes;HAVE_AES;SIMD_FLAGS" dnl - "avx;avx;AVX;ecx_cpuid1,28;-mavx;HAVE_AVX;SIMD_FLAGS" dnl - "avx;fma3;FMA3;ecx_cpuid1,12;-mfma;HAVE_FMA3;SIMD_FLAGS" dnl - "avx;fma4;FMA4;ecx_cpuid80000001,16;-mfma4;HAVE_FMA4;SIMD_FLAGS" dnl - "avx;xop;XOP;ecx_cpuid80000001,11;-mxop;HAVE_XOP;SIMD_FLAGS" dnl - "avx;avx2;AVX2;ebx_cpuid7,5;-mavx2;HAVE_AVX2;SIMD_FLAGS" dnl - "avx512;avx512f;AVX512-F;ebx_cpuid7,16;-mavx512f;HAVE_AVX512_F;SIMD_FLAGS" dnl - "avx512;avx512cd;AVX512-CD;ebx_cpuid7,28;-mavx512cd;HAVE_AVX512_CD;SIMD_FLAGS" dnl - "avx512;avx512pf;AVX512-PF;ebx_cpuid7,26;-mavx512pf;HAVE_AVX512_PF;SIMD_FLAGS" dnl - "avx512;avx512er;AVX512-ER;ebx_cpuid7,27;-mavx512er;HAVE_AVX512_ER;SIMD_FLAGS" dnl - "avx512;avx512vl;AVX512-VL;ebx_cpuid7,31;-mavx512vl;HAVE_AVX512_VL;SIMD_FLAGS" dnl - "avx512;avx512bw;AVX512-BW;ebx_cpuid7,30;-mavx512bw;HAVE_AVX512_BW;SIMD_FLAGS" dnl - "avx512;avx512dq;AVX512-DQ;ebx_cpuid7,17;-mavx512dq;HAVE_AVX512_DQ;SIMD_FLAGS" dnl - "avx512;avx512ifma;AVX512-IFMA;ebx_cpuid7,21;-mavx512ifma;HAVE_AVX512_IFMA;SIMD_FLAGS" dnl - "avx512;avx512vbmi;AVX512-VBMI;ecx_cpuid7,1;-mavx512vbmi;HAVE_AVX512_VBMI;SIMD_FLAGS" dnl - # - do ac_instr_os_support=$(eval echo \$ax_cv_have_$(echo $ac_instr_info | cut -d ";" -f 
1)_os_support_ext) - ac_instr_acvar=$(echo $ac_instr_info | cut -d ";" -f 2) - ac_instr_shortname=$(echo $ac_instr_info | cut -d ";" -f 3) - ac_instr_chk_loc=$(echo $ac_instr_info | cut -d ";" -f 4) - ac_instr_chk_reg=0x$(eval echo \$$(echo $ac_instr_chk_loc | cut -d "," -f 1)) - ac_instr_chk_bit=$(echo $ac_instr_chk_loc | cut -d "," -f 2) - ac_instr_compiler_flags=$(echo $ac_instr_info | cut -d ";" -f 5) - ac_instr_have_define=$(echo $ac_instr_info | cut -d ";" -f 6) - ac_instr_flag_type=$(echo $ac_instr_info | cut -d ";" -f 7) - - AC_CACHE_CHECK([whether ${ac_instr_shortname} is supported by the processor], [ax_cv_have_${ac_instr_acvar}_cpu_ext], - [ - eval ax_cv_have_${ac_instr_acvar}_cpu_ext=no - if test "$((${ac_instr_chk_reg}>>${ac_instr_chk_bit}&0x01))" = 1 ; then - eval ax_cv_have_${ac_instr_acvar}_cpu_ext=yes - fi - ]) - - if test x"$(eval echo \$ax_cv_have_${ac_instr_acvar}_cpu_ext)" = x"yes"; then - AC_CACHE_CHECK([whether ${ac_instr_shortname} is supported by the processor and OS], [ax_cv_have_${ac_instr_acvar}_ext], - [ - eval ax_cv_have_${ac_instr_acvar}_ext=no - if test x"${ac_instr_os_support}" = x"yes"; then - eval ax_cv_have_${ac_instr_acvar}_ext=yes - fi - ]) - - if test "$(eval echo \$ax_cv_have_${ac_instr_acvar}_ext)" = yes; then - AX_CHECK_COMPILE_FLAG(${ac_instr_compiler_flags}, eval ax_cv_support_${ac_instr_acvar}_ext=yes, - eval ax_cv_support_${ac_instr_acvar}_ext=no) - if test x"$(eval echo \$ax_cv_support_${ac_instr_acvar}_ext)" = x"yes"; then - eval ${ac_instr_flag_type}=\"\$${ac_instr_flag_type} ${ac_instr_compiler_flags}\" - AC_DEFINE_UNQUOTED([${ac_instr_have_define}]) - else - AC_MSG_WARN([Your processor and OS supports ${ac_instr_shortname} instructions but not your compiler, can you try another compiler?]) - fi - else - if test x"${ac_instr_os_support}" = x"no"; then - AC_CACHE_VAL(ax_cv_support_${ac_instr_acvar}_ext, eval ax_cv_support_${ac_instr_acvar}_ext=no) - AC_MSG_WARN([Your processor supports ${ac_instr_shortname}, but your OS doesn't]) - fi - fi - else - AC_CACHE_VAL(ax_cv_have_${ac_instr_acvar}_ext, eval ax_cv_have_${ac_instr_acvar}_ext=no) - AC_CACHE_VAL(ax_cv_support_${ac_instr_acvar}_ext, eval ax_cv_support_${ac_instr_acvar}_ext=no) - fi - done + SIMD_FLAGS="-mavx2" ;; esac - AH_TEMPLATE([HAVE_RDRND],[Define to 1 to support Digital Random Number Generator]) - AH_TEMPLATE([HAVE_BMI1],[Define to 1 to support Bit Manipulation Instruction Set 1]) - AH_TEMPLATE([HAVE_BMI2],[Define to 1 to support Bit Manipulation Instruction Set 2]) - AH_TEMPLATE([HAVE_ADX],[Define to 1 to support Multi-Precision Add-Carry Instruction Extensions]) - AH_TEMPLATE([HAVE_MPX],[Define to 1 to support Memory Protection Extensions]) - AH_TEMPLATE([HAVE_PREFETCHWT1],[Define to 1 to support Prefetch Vector Data Into Caches WT1]) - AH_TEMPLATE([HAVE_ABM],[Define to 1 to support Advanced Bit Manipulation]) - AH_TEMPLATE([HAVE_MMX],[Define to 1 to support Multimedia Extensions]) - AH_TEMPLATE([HAVE_SSE],[Define to 1 to support Streaming SIMD Extensions]) - AH_TEMPLATE([HAVE_SSE2],[Define to 1 to support Streaming SIMD Extensions]) - AH_TEMPLATE([HAVE_SSE3],[Define to 1 to support Streaming SIMD Extensions 3]) - AH_TEMPLATE([HAVE_SSSE3],[Define to 1 to support Supplemental Streaming SIMD Extensions 3]) - AH_TEMPLATE([HAVE_SSE4_1],[Define to 1 to support Streaming SIMD Extensions 4.1]) - AH_TEMPLATE([HAVE_SSE4_2],[Define to 1 to support Streaming SIMD Extensions 4.2]) - AH_TEMPLATE([HAVE_SSE4a],[Define to 1 to support AMD Streaming SIMD Extensions 4a]) - 
AH_TEMPLATE([HAVE_SHA],[Define to 1 to support Secure Hash Algorithm Extension]) - AH_TEMPLATE([HAVE_AES],[Define to 1 to support Advanced Encryption Standard New Instruction Set (AES-NI)]) - AH_TEMPLATE([HAVE_AVX],[Define to 1 to support Advanced Vector Extensions]) - AH_TEMPLATE([HAVE_FMA3],[Define to 1 to support Fused Multiply-Add Extensions 3]) - AH_TEMPLATE([HAVE_FMA4],[Define to 1 to support Fused Multiply-Add Extensions 4]) - AH_TEMPLATE([HAVE_XOP],[Define to 1 to support eXtended Operations Extensions]) - AH_TEMPLATE([HAVE_AVX2],[Define to 1 to support Advanced Vector Extensions 2]) - AH_TEMPLATE([HAVE_AVX512_F],[Define to 1 to support AVX-512 Foundation Extensions]) - AH_TEMPLATE([HAVE_AVX512_CD],[Define to 1 to support AVX-512 Conflict Detection Instructions]) - AH_TEMPLATE([HAVE_AVX512_PF],[Define to 1 to support AVX-512 Conflict Prefetch Instructions]) - AH_TEMPLATE([HAVE_AVX512_ER],[Define to 1 to support AVX-512 Exponential & Reciprocal Instructions]) - AH_TEMPLATE([HAVE_AVX512_VL],[Define to 1 to support AVX-512 Vector Length Extensions]) - AH_TEMPLATE([HAVE_AVX512_BW],[Define to 1 to support AVX-512 Byte and Word Instructions]) - AH_TEMPLATE([HAVE_AVX512_DQ],[Define to 1 to support AVX-512 Doubleword and Quadword Instructions]) - AH_TEMPLATE([HAVE_AVX512_IFMA],[Define to 1 to support AVX-512 Integer Fused Multiply Add Instructions]) - AH_TEMPLATE([HAVE_AVX512_VBMI],[Define to 1 to support AVX-512 Vector Byte Manipulation Instructions]) AC_SUBST(SIMD_FLAGS) AC_SUBST(CPUEXT_FLAGS) ]) --- msolve-0.5.0/src/fglm/fglm_core.c.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/src/fglm/fglm_core.c 2023-07-28 14:08:11.775979716 -0600 @@ -41,6 +41,8 @@ double omp_get_wtime(void) { return real #define DEBUGFGLM 0 #define BLOCKWIED 0 +extern _Bool have_avx2; + #include "data_fglm.c" #include "linalg-fglm.c" #include "matrix-mult.c" @@ -449,15 +451,16 @@ static inline void sparse_mat_fglm_mult_ res[mat->triv_idx[i]] = vec[mat->triv_pos[i]]; } /* printf("ncols %u\n", ncols); */ -#ifdef HAVE_AVX2 - /* matrix_vector_product(vres, mat->dense_mat, vec, ncols, nrows, prime, RED_32, RED_64); */ - _8mul_matrix_vector_product(vres, mat->dense_mat, vec, mat->dst, - ncols, nrows, prime, RED_32, RED_64, - preinv,st); -#else - non_avx_matrix_vector_product(vres, mat->dense_mat, vec, - ncols, nrows, prime, RED_32, RED_64,st); +#ifdef __amd64__ + if (have_avx2) + /* matrix_vector_product(vres, mat->dense_mat, vec, ncols, nrows, prime, RED_32, RED_64); */ + _8mul_matrix_vector_product(vres, mat->dense_mat, vec, mat->dst, + ncols, nrows, prime, RED_32, RED_64, + preinv,st); + else #endif + non_avx_matrix_vector_product(vres, mat->dense_mat, vec, + ncols, nrows, prime, RED_32, RED_64,st); for(szmat_t i = 0; i < nrows; i++){ res[mat->dense_idx[i]] = vres[i]; } @@ -498,17 +501,18 @@ static inline void sparse_mat_fglm_colon } /* printf ("zero\n"); */ /* printf("ncols %u\n", ncols); */ -#ifdef HAVE_AVX2 - /* matrix_vector_product(vres, mat->dense_mat, vec, ncols, nrows, prime, RED_32, RED_64); */ - _8mul_matrix_vector_product(vres, mat->dense_mat, vec, mat->dst, - ncols, nrows, prime, RED_32, RED_64, - preinv,st); +#ifdef __amd64__ + if (have_avx2) + /* matrix_vector_product(vres, mat->dense_mat, vec, ncols, nrows, prime, RED_32, RED_64); */ + _8mul_matrix_vector_product(vres, mat->dense_mat, vec, mat->dst, + ncols, nrows, prime, RED_32, RED_64, + preinv,st); /* printf ("mul AVX\n"); */ -#else - non_avx_matrix_vector_product(vres, mat->dense_mat, vec, - ncols, nrows, prime, RED_32, RED_64,st); - /* 
printf ("mul non AVX\n"); */ + else #endif + non_avx_matrix_vector_product(vres, mat->dense_mat, vec, + ncols, nrows, prime, RED_32, RED_64,st); + /* printf ("mul non AVX\n"); */ for(szmat_t i = 0; i < nrows; i++){ res[mat->dense_idx[i]] = vres[i]; } --- msolve-0.5.0/src/fglm/inner-product.c.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/src/fglm/inner-product.c 2023-07-27 17:14:03.128200166 -0600 @@ -22,7 +22,7 @@ #include #include -#ifdef HAVE_AVX2 +#ifdef __amd64__ #include #define AVX2LOAD(A) _mm256_load_si256((__m256i*)(A)) @@ -257,7 +257,7 @@ static inline void non_avx_matrix_vector } } -#ifdef HAVE_AVX2 +#ifdef __amd64__ static inline void matrix_vector_product(uint32_t* vec_res, const uint32_t* mat, const uint32_t* vec, const uint32_t ncols, const uint32_t nrows, const uint32_t PRIME, --- msolve-0.5.0/src/fglm/linalg-fglm.c.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/src/fglm/linalg-fglm.c 2023-07-27 17:14:28.342887244 -0600 @@ -22,7 +22,7 @@ #include #include -#ifdef HAVE_AVX2 +#ifdef __amd64__ #include #define AVX2LOAD(A) _mm256_load_si256((__m256i*)(A)) @@ -230,7 +230,7 @@ static inline void non_avx_matrix_vector } } -#ifdef HAVE_AVX2 +#ifdef __amd64__ static inline void matrix_vector_product(uint32_t* vec_res, const uint32_t* mat, const uint32_t* vec, const uint32_t ncols, const uint32_t nrows, const uint32_t PRIME, --- msolve-0.5.0/src/fglm/matrix-mult.c.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/src/fglm/matrix-mult.c 2023-07-27 17:16:17.950526960 -0600 @@ -21,7 +21,7 @@ //-1 sur 32 bits #define MONE32 ((uint32_t)0xFFFFFFFF) -#ifdef HAVE_AVX2 +#ifdef __amd64__ #include #define AVX2LOAD(A) _mm256_load_si256((__m256i*)(A)) @@ -110,7 +110,7 @@ static inline uint64_t SUBMODRED32(uint6 return (a < b) ? 
(p + neg) : (neg); } -#if HAVE_AVX2 +#if __amd64__ static inline void REDUCE(uint64_t *acc64, uint64_t *acc4x64, __m256i acc_low, __m256i acc_high, const uint32_t fc, const uint32_t preinv, @@ -165,7 +165,7 @@ static __inline__ void make_zero(uint32_ } -#ifdef HAVE_AVX2 +#ifdef __amd64__ /* m = number of rows in A l = number of cols in A = number of rows in B @@ -283,15 +283,18 @@ static inline void sparse_matfglm_mul(CF /* real product */ -#ifdef HAVE_AVX2 - _mod_mat_addmul_transpose_op(tres, matxn->dense_mat, R, - matxn->nrows, matxn->ncols, nc, - prime, preinv, - RED_32, RED_64); -#else - fprintf(stderr, "Not implemented yet\n"); - exit(1); +#ifdef __amd64__ + if (have_avx2) + _mod_mat_addmul_transpose_op(tres, matxn->dense_mat, R, + matxn->nrows, matxn->ncols, nc, + prime, preinv, + RED_32, RED_64); + else #endif + { + fprintf(stderr, "Not implemented yet\n"); + exit(1); + } for(szmat_t j = 0; j < nrows; j++){ for(int i = 0; i < nc; i++){ --- msolve-0.5.0/src/neogb/data.c.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/src/neogb/data.c 2023-07-28 11:18:11.344065251 -0600 @@ -21,6 +21,19 @@ #include "data.h" +/* check for AVX2 availability */ +#ifdef __amd64__ +#include <cpuid.h> +_Bool have_avx2; + +static void __attribute__((constructor)) +set_avx2_flag(void) +{ + unsigned int eax, ebx, ecx, edx; + have_avx2 = __get_cpuid(7, &eax, &ebx, &ecx, &edx) && (ebx & bit_AVX2) != 0; +} +#endif + /* function pointers */ /* bs_t *(*initialize_basis)( * const int32_t ngens --- msolve-0.5.0/src/neogb/data.h.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/src/neogb/data.h 2023-07-28 11:17:26.592696306 -0600 @@ -40,6 +40,11 @@ inline omp_int_t omp_get_thread_num(void inline omp_int_t omp_get_max_threads(void) { return 1;} #endif +/* check if AVX2 is available */ +#ifdef __amd64__ +extern _Bool have_avx2; +#endif + #define PARALLEL_HASHING 1 #define ORDER_COLUMNS 1 /* loop unrolling in sparse linear algebra: --- msolve-0.5.0/src/neogb/la_ff_16.c.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/src/neogb/la_ff_16.c 2023-07-28 11:18:23.967887228 -0600 @@ -20,7 +20,7 @@ #include "data.h" -#ifdef HAVE_AVX2 +#ifdef __amd64__ #include <immintrin.h> #endif @@ -122,7 +122,7 @@ static hm_t *reduce_dense_row_by_known_p const len_t ncl = mat->ncl; cf16_t * const * const mcf = mat->cf_16; -#ifdef HAVE_AVX2 +#ifdef __amd64__ uint32_t mone32 = (uint32_t)0xFFFFFFFF; uint16_t mone16 = (uint16_t)0xFFFF; uint32_t mone16h = (uint32_t)0xFFFF0000; @@ -157,82 +157,85 @@ static hm_t *reduce_dense_row_by_known_p } else { cfs = mcf[dts[COEFFS]]; } -#ifdef HAVE_AVX2 - const uint16_t mul16 = (uint16_t)(fc - dr[i]); - mulv = _mm256_set1_epi16(mul16); const len_t len = dts[LENGTH]; - const len_t os = len % 16; - const hm_t * const ds = dts + OFFSET; - for (j = 0; j < os; ++j) { - dr[ds[j]] += mul * cfs[j]; - } - for (; j < len; j += 16) { - redv = _mm256_loadu_si256((__m256i*)(cfs+j)); - prodh = _mm256_mulhi_epu16(mulv, redv); - prodl = _mm256_mullo_epi16(mulv, redv); - prod = _mm256_xor_si256( - _mm256_and_si256(prodh, mask16h), _mm256_srli_epi32(prodl, 16)); - drv = _mm256_setr_epi64x( - dr[ds[j+1]], - dr[ds[j+5]], - dr[ds[j+9]], - dr[ds[j+13]]); - resv = _mm256_add_epi64(drv, _mm256_and_si256(prod, mask32)); - _mm256_store_si256((__m256i*)(res),resv); - dr[ds[j+1]] = res[0]; - dr[ds[j+5]] = res[1]; - dr[ds[j+9]] = res[2]; - dr[ds[j+13]] = res[3]; - drv = _mm256_setr_epi64x( - dr[ds[j+3]], - dr[ds[j+7]], - dr[ds[j+11]], - dr[ds[j+15]]); - resv = _mm256_add_epi64(drv, _mm256_srli_epi64(prod, 32)); - 
_mm256_store_si256((__m256i*)(res),resv); - dr[ds[j+3]] = res[0]; - dr[ds[j+7]] = res[1]; - dr[ds[j+11]] = res[2]; - dr[ds[j+15]] = res[3]; - prod = _mm256_xor_si256( - _mm256_slli_epi32(prodh, 16), _mm256_and_si256(prodl, mask16)); - drv = _mm256_setr_epi64x( - dr[ds[j+0]], - dr[ds[j+4]], - dr[ds[j+8]], - dr[ds[j+12]]); - resv = _mm256_add_epi64(drv, _mm256_and_si256(prod, mask32)); - _mm256_store_si256((__m256i*)(res),resv); - dr[ds[j+0]] = res[0]; - dr[ds[j+4]] = res[1]; - dr[ds[j+8]] = res[2]; - dr[ds[j+12]] = res[3]; - drv = _mm256_setr_epi64x( - dr[ds[j+2]], - dr[ds[j+6]], - dr[ds[j+10]], - dr[ds[j+14]]); - resv = _mm256_add_epi64(drv, _mm256_srli_epi64(prod, 32)); - _mm256_store_si256((__m256i*)(res),resv); - dr[ds[j+2]] = res[0]; - dr[ds[j+6]] = res[1]; - dr[ds[j+10]] = res[2]; - dr[ds[j+14]] = res[3]; - } -#else - const len_t os = dts[PRELOOP]; - const len_t len = dts[LENGTH]; - const hm_t * const ds = dts + OFFSET; - for (j = 0; j < os; ++j) { - dr[ds[j]] += mul * cfs[j]; - } - for (; j < len; j += UNROLL) { - dr[ds[j]] += mul * cfs[j]; - dr[ds[j+1]] += mul * cfs[j+1]; - dr[ds[j+2]] += mul * cfs[j+2]; - dr[ds[j+3]] += mul * cfs[j+3]; +#ifdef __amd64__ + if (have_avx2) { + const uint16_t mul16 = (uint16_t)(fc - dr[i]); + mulv = _mm256_set1_epi16(mul16); + const len_t os = len % 16; + const hm_t * const ds = dts + OFFSET; + for (j = 0; j < os; ++j) { + dr[ds[j]] += mul * cfs[j]; + } + for (; j < len; j += 16) { + redv = _mm256_loadu_si256((__m256i*)(cfs+j)); + prodh = _mm256_mulhi_epu16(mulv, redv); + prodl = _mm256_mullo_epi16(mulv, redv); + prod = _mm256_xor_si256( + _mm256_and_si256(prodh, mask16h), _mm256_srli_epi32(prodl, 16)); + drv = _mm256_setr_epi64x( + dr[ds[j+1]], + dr[ds[j+5]], + dr[ds[j+9]], + dr[ds[j+13]]); + resv = _mm256_add_epi64(drv, _mm256_and_si256(prod, mask32)); + _mm256_store_si256((__m256i*)(res),resv); + dr[ds[j+1]] = res[0]; + dr[ds[j+5]] = res[1]; + dr[ds[j+9]] = res[2]; + dr[ds[j+13]] = res[3]; + drv = _mm256_setr_epi64x( + dr[ds[j+3]], + dr[ds[j+7]], + dr[ds[j+11]], + dr[ds[j+15]]); + resv = _mm256_add_epi64(drv, _mm256_srli_epi64(prod, 32)); + _mm256_store_si256((__m256i*)(res),resv); + dr[ds[j+3]] = res[0]; + dr[ds[j+7]] = res[1]; + dr[ds[j+11]] = res[2]; + dr[ds[j+15]] = res[3]; + prod = _mm256_xor_si256( + _mm256_slli_epi32(prodh, 16), _mm256_and_si256(prodl, mask16)); + drv = _mm256_setr_epi64x( + dr[ds[j+0]], + dr[ds[j+4]], + dr[ds[j+8]], + dr[ds[j+12]]); + resv = _mm256_add_epi64(drv, _mm256_and_si256(prod, mask32)); + _mm256_store_si256((__m256i*)(res),resv); + dr[ds[j+0]] = res[0]; + dr[ds[j+4]] = res[1]; + dr[ds[j+8]] = res[2]; + dr[ds[j+12]] = res[3]; + drv = _mm256_setr_epi64x( + dr[ds[j+2]], + dr[ds[j+6]], + dr[ds[j+10]], + dr[ds[j+14]]); + resv = _mm256_add_epi64(drv, _mm256_srli_epi64(prod, 32)); + _mm256_store_si256((__m256i*)(res),resv); + dr[ds[j+2]] = res[0]; + dr[ds[j+6]] = res[1]; + dr[ds[j+10]] = res[2]; + dr[ds[j+14]] = res[3]; + } } + else #endif + { + const len_t os = dts[PRELOOP]; + const hm_t * const ds = dts + OFFSET; + for (j = 0; j < os; ++j) { + dr[ds[j]] += mul * cfs[j]; + } + for (; j < len; j += UNROLL) { + dr[ds[j]] += mul * cfs[j]; + dr[ds[j+1]] += mul * cfs[j+1]; + dr[ds[j+2]] += mul * cfs[j+2]; + dr[ds[j+3]] += mul * cfs[j+3]; + } + } dr[i] = 0; } if (k == 0) { --- msolve-0.5.0/src/neogb/la_ff_32.c.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/src/neogb/la_ff_32.c 2023-07-28 11:18:29.383810853 -0600 @@ -20,7 +20,7 @@ #include "data.h" -#ifdef HAVE_AVX2 +#ifdef __amd64__ #include #endif @@ -319,7 +319,7 
@@ static hm_t *reduce_dense_row_by_known_p const len_t ncl = mat->ncl; cf32_t * const * const mcf = mat->cf_32; -#ifdef HAVE_AVX2 +#ifdef __amd64__ int64_t res[4]; __m256i redv, mulv, prodv, drv, resv; #endif @@ -348,58 +348,60 @@ static hm_t *reduce_dense_row_by_known_p } else { cfs = mcf[dts[COEFFS]]; } -#ifdef HAVE_AVX2 - const len_t len = dts[LENGTH]; - const len_t os = len % 8; - const hm_t * const ds = dts + OFFSET; - const uint32_t mul32 = (int32_t)(mod - dr[i]); - mulv = _mm256_set1_epi32(mul32); - for (j = 0; j < os; ++j) { - dr[ds[j]] += mul * cfs[j]; - } - for (; j < len; j += 8) { - redv = _mm256_lddqu_si256((__m256i*)(cfs+j)); - drv = _mm256_setr_epi64x( - dr[ds[j+1]], - dr[ds[j+3]], - dr[ds[j+5]], - dr[ds[j+7]]); - /* first four mult-adds -- lower */ - prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); - resv = _mm256_add_epi64(drv, prodv); - _mm256_storeu_si256((__m256i*)(res), resv); - dr[ds[j+1]] = res[0]; - dr[ds[j+3]] = res[1]; - dr[ds[j+5]] = res[2]; - dr[ds[j+7]] = res[3]; - /* second four mult-adds -- higher */ - prodv = _mm256_mul_epu32(mulv, redv); - drv = _mm256_setr_epi64x( - dr[ds[j]], - dr[ds[j+2]], - dr[ds[j+4]], - dr[ds[j+6]]); - resv = _mm256_add_epi64(drv, prodv); - _mm256_storeu_si256((__m256i*)(res), resv); - dr[ds[j]] = res[0]; - dr[ds[j+2]] = res[1]; - dr[ds[j+4]] = res[2]; - dr[ds[j+6]] = res[3]; - } -#else - const len_t os = dts[PRELOOP]; const len_t len = dts[LENGTH]; - const hm_t * const ds = dts + OFFSET; - for (j = 0; j < os; ++j) { - dr[ds[j]] += mul * cfs[j]; - } - for (; j < len; j += UNROLL) { - dr[ds[j]] += mul * cfs[j]; - dr[ds[j+1]] += mul * cfs[j+1]; - dr[ds[j+2]] += mul * cfs[j+2]; - dr[ds[j+3]] += mul * cfs[j+3]; - } +#ifdef __amd64__ + if (have_avx2) { + const len_t os = len % 8; + const hm_t * const ds = dts + OFFSET; + const uint32_t mul32 = (int32_t)(mod - dr[i]); + mulv = _mm256_set1_epi32(mul32); + for (j = 0; j < os; ++j) { + dr[ds[j]] += mul * cfs[j]; + } + for (; j < len; j += 8) { + redv = _mm256_lddqu_si256((__m256i*)(cfs+j)); + drv = _mm256_setr_epi64x( + dr[ds[j+1]], + dr[ds[j+3]], + dr[ds[j+5]], + dr[ds[j+7]]); + /* first four mult-adds -- lower */ + prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); + resv = _mm256_add_epi64(drv, prodv); + _mm256_storeu_si256((__m256i*)(res), resv); + dr[ds[j+1]] = res[0]; + dr[ds[j+3]] = res[1]; + dr[ds[j+5]] = res[2]; + dr[ds[j+7]] = res[3]; + /* second four mult-adds -- higher */ + prodv = _mm256_mul_epu32(mulv, redv); + drv = _mm256_setr_epi64x( + dr[ds[j]], + dr[ds[j+2]], + dr[ds[j+4]], + dr[ds[j+6]]); + resv = _mm256_add_epi64(drv, prodv); + _mm256_storeu_si256((__m256i*)(res), resv); + dr[ds[j]] = res[0]; + dr[ds[j+2]] = res[1]; + dr[ds[j+4]] = res[2]; + dr[ds[j+6]] = res[3]; + } + } else #endif + { + const len_t os = dts[PRELOOP]; + const hm_t * const ds = dts + OFFSET; + for (j = 0; j < os; ++j) { + dr[ds[j]] += mul * cfs[j]; + } + for (; j < len; j += UNROLL) { + dr[ds[j]] += mul * cfs[j]; + dr[ds[j+1]] += mul * cfs[j+1]; + dr[ds[j+2]] += mul * cfs[j+2]; + dr[ds[j+3]] += mul * cfs[j+3]; + } + } dr[i] = 0; st->application_nr_mult += len / 1000.0; st->application_nr_add += len / 1000.0; @@ -533,10 +535,10 @@ static hm_t *reduce_dense_row_by_known_p int64_t np = -1; const int64_t mod = (int64_t)st->fc; const int64_t mod2 = (int64_t)st->fc * st->fc; -#ifdef HAVE_AVX2 +#ifdef __amd64__ int64_t res[4] __attribute__((aligned(32))); __m256i cmpv, redv, drv, mulv, prodv, resv, rresv; - __m256i zerov= _mm256_set1_epi64x(0); + __m256i zerov = _mm256_set1_epi64x(0); 
__m256i mod2v = _mm256_set1_epi64x(mod2); #endif @@ -558,68 +560,70 @@ static hm_t *reduce_dense_row_by_known_p const int64_t mul = (int64_t)dr[i]; dts = pivs[i]; cfs = bs->cf_32[dts[COEFFS]]; -#ifdef HAVE_AVX2 - const len_t len = dts[LENGTH]; - const len_t os = len % 8; - const hm_t * const ds = dts + OFFSET; - const uint32_t mul32 = (uint32_t)(dr[i]); - mulv = _mm256_set1_epi32(mul32); - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += 8) { - redv = _mm256_loadu_si256((__m256i*)(cfs+j)); - drv = _mm256_setr_epi64x( - dr[ds[j+1]], - dr[ds[j+3]], - dr[ds[j+5]], - dr[ds[j+7]]); - /* first four mult-adds -- lower */ - prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j+1]] = res[0]; - dr[ds[j+3]] = res[1]; - dr[ds[j+5]] = res[2]; - dr[ds[j+7]] = res[3]; - /* second four mult-adds -- higher */ - prodv = _mm256_mul_epu32(mulv, redv); - drv = _mm256_setr_epi64x( - dr[ds[j]], - dr[ds[j+2]], - dr[ds[j+4]], - dr[ds[j+6]]); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j]] = res[0]; - dr[ds[j+2]] = res[1]; - dr[ds[j+4]] = res[2]; - dr[ds[j+6]] = res[3]; - } -#else - const len_t os = dts[PRELOOP]; const len_t len = dts[LENGTH]; - const hm_t * const ds = dts + OFFSET; - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += UNROLL) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j+1]] -= mul * cfs[j+1]; - dr[ds[j+2]] -= mul * cfs[j+2]; - dr[ds[j+3]] -= mul * cfs[j+3]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - dr[ds[j+1]] += (dr[ds[j+1]] >> 63) & mod2; - dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; - dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; - } +#ifdef __amd64__ + if (have_avx2) { + const len_t os = len % 8; + const hm_t * const ds = dts + OFFSET; + const uint32_t mul32 = (uint32_t)(dr[i]); + mulv = _mm256_set1_epi32(mul32); + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } + for (; j < len; j += 8) { + redv = _mm256_loadu_si256((__m256i*)(cfs+j)); + drv = _mm256_setr_epi64x( + dr[ds[j+1]], + dr[ds[j+3]], + dr[ds[j+5]], + dr[ds[j+7]]); + /* first four mult-adds -- lower */ + prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j+1]] = res[0]; + dr[ds[j+3]] = res[1]; + dr[ds[j+5]] = res[2]; + dr[ds[j+7]] = res[3]; + /* second four mult-adds -- higher */ + prodv = _mm256_mul_epu32(mulv, redv); + drv = _mm256_setr_epi64x( + dr[ds[j]], + dr[ds[j+2]], + dr[ds[j+4]], + dr[ds[j+6]]); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j]] = res[0]; + dr[ds[j+2]] = res[1]; + dr[ds[j+4]] = res[2]; + dr[ds[j+6]] = res[3]; + } + } else #endif + { + const len_t os = dts[PRELOOP]; + const hm_t * const ds = dts + OFFSET; + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } 
+ for (; j < len; j += UNROLL) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j+1]] -= mul * cfs[j+1]; + dr[ds[j+2]] -= mul * cfs[j+2]; + dr[ds[j+3]] -= mul * cfs[j+3]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + dr[ds[j+1]] += (dr[ds[j+1]] >> 63) & mod2; + dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; + dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; + } + } dr[i] = 0; st->application_nr_mult += len / 1000.0; st->application_nr_add += len / 1000.0; @@ -675,10 +679,10 @@ static hm_t *sba_reduce_dense_row_by_kno const int64_t mod = (int64_t)st->fc; const int64_t mod2 = (int64_t)st->fc * st->fc; const len_t nc = smat->nc; -#ifdef HAVE_AVX2 +#ifdef __amd64__ int64_t res[4] __attribute__((aligned(32))); __m256i cmpv, redv, drv, mulv, prodv, resv, rresv; - __m256i zerov= _mm256_set1_epi64x(0); + __m256i zerov = _mm256_set1_epi64x(0); __m256i mod2v = _mm256_set1_epi64x(mod2); #endif @@ -712,68 +716,70 @@ static hm_t *sba_reduce_dense_row_by_kno dts = pivs[i]; cfs = smat->cc32[dts[SM_CFS]]; -#ifdef HAVE_AVX2 - const len_t len = dts[SM_LEN]; - const len_t os = len % 8; - const hm_t * const ds = dts + SM_OFFSET; - const uint32_t mul32 = (uint32_t)(dr[i]); - mulv = _mm256_set1_epi32(mul32); - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += 8) { - redv = _mm256_loadu_si256((__m256i*)(cfs+j)); - drv = _mm256_setr_epi64x( - dr[ds[j+1]], - dr[ds[j+3]], - dr[ds[j+5]], - dr[ds[j+7]]); - /* first four mult-adds -- lower */ - prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j+1]] = res[0]; - dr[ds[j+3]] = res[1]; - dr[ds[j+5]] = res[2]; - dr[ds[j+7]] = res[3]; - /* second four mult-adds -- higher */ - prodv = _mm256_mul_epu32(mulv, redv); - drv = _mm256_setr_epi64x( - dr[ds[j]], - dr[ds[j+2]], - dr[ds[j+4]], - dr[ds[j+6]]); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j]] = res[0]; - dr[ds[j+2]] = res[1]; - dr[ds[j+4]] = res[2]; - dr[ds[j+6]] = res[3]; - } -#else - const len_t os = dts[SM_PRE]; - const len_t len = dts[SM_LEN]; - const hm_t * const ds = dts + SM_OFFSET; - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += UNROLL) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j+1]] -= mul * cfs[j+1]; - dr[ds[j+2]] -= mul * cfs[j+2]; - dr[ds[j+3]] -= mul * cfs[j+3]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - dr[ds[j+1]] += (dr[ds[j+1]] >> 63) & mod2; - dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; - dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; - } + const len_t len = dts[SM_LEN]; +#ifdef __amd64__ + if (have_avx2) { + const len_t os = len % 8; + const hm_t * const ds = dts + SM_OFFSET; + const uint32_t mul32 = (uint32_t)(dr[i]); + mulv = _mm256_set1_epi32(mul32); + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } + for (; j < len; j += 8) { + redv = _mm256_loadu_si256((__m256i*)(cfs+j)); + drv = _mm256_setr_epi64x( + dr[ds[j+1]], + dr[ds[j+3]], + dr[ds[j+5]], + dr[ds[j+7]]); + /* first four mult-adds -- lower */ + prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = 
_mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j+1]] = res[0]; + dr[ds[j+3]] = res[1]; + dr[ds[j+5]] = res[2]; + dr[ds[j+7]] = res[3]; + /* second four mult-adds -- higher */ + prodv = _mm256_mul_epu32(mulv, redv); + drv = _mm256_setr_epi64x( + dr[ds[j]], + dr[ds[j+2]], + dr[ds[j+4]], + dr[ds[j+6]]); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j]] = res[0]; + dr[ds[j+2]] = res[1]; + dr[ds[j+4]] = res[2]; + dr[ds[j+6]] = res[3]; + } + } else #endif + { + const len_t os = dts[SM_PRE]; + const hm_t * const ds = dts + SM_OFFSET; + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } + for (; j < len; j += UNROLL) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j+1]] -= mul * cfs[j+1]; + dr[ds[j+2]] -= mul * cfs[j+2]; + dr[ds[j+3]] -= mul * cfs[j+3]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + dr[ds[j+1]] += (dr[ds[j+1]] >> 63) & mod2; + dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; + dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; + } + } dr[i] = 0; st->application_nr_mult += len / 1000.0; st->application_nr_add += len / 1000.0; @@ -832,10 +838,10 @@ static hm_t *reduce_dense_row_by_known_p const len_t ncols = mat->nc; const len_t ncl = mat->ncl; cf32_t * const * const mcf = mat->cf_32; -#ifdef HAVE_AVX2 +#ifdef __amd64__ int64_t res[4] __attribute__((aligned(32))); __m256i cmpv, redv, drv, mulv, prodv, resv, rresv; - __m256i zerov= _mm256_set1_epi64x(0); + __m256i zerov = _mm256_set1_epi64x(0); __m256i mod2v = _mm256_set1_epi64x(mod2); #endif @@ -863,68 +869,70 @@ static hm_t *reduce_dense_row_by_known_p } else { cfs = mcf[dts[COEFFS]]; } -#ifdef HAVE_AVX2 - const len_t len = dts[LENGTH]; - const len_t os = len % 8; - const hm_t * const ds = dts + OFFSET; - const uint32_t mul32 = (uint32_t)(dr[i]); - mulv = _mm256_set1_epi32(mul32); - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += 8) { - redv = _mm256_loadu_si256((__m256i*)(cfs+j)); - drv = _mm256_setr_epi64x( - dr[ds[j+1]], - dr[ds[j+3]], - dr[ds[j+5]], - dr[ds[j+7]]); - /* first four mult-adds -- lower */ - prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j+1]] = res[0]; - dr[ds[j+3]] = res[1]; - dr[ds[j+5]] = res[2]; - dr[ds[j+7]] = res[3]; - /* second four mult-adds -- higher */ - prodv = _mm256_mul_epu32(mulv, redv); - drv = _mm256_setr_epi64x( - dr[ds[j]], - dr[ds[j+2]], - dr[ds[j+4]], - dr[ds[j+6]]); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j]] = res[0]; - dr[ds[j+2]] = res[1]; - dr[ds[j+4]] = res[2]; - dr[ds[j+6]] = res[3]; - } -#else - const len_t os = dts[PRELOOP]; - const len_t len = dts[LENGTH]; - const hm_t * const ds = dts + OFFSET; - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += UNROLL) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j+1]] -= mul * cfs[j+1]; - dr[ds[j+2]] -= mul * cfs[j+2]; - dr[ds[j+3]] -= mul * cfs[j+3]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - dr[ds[j+1]] 
+= (dr[ds[j+1]] >> 63) & mod2; - dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; - dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; - } + const len_t len = dts[LENGTH]; +#ifdef __amd64__ + if (have_avx2) { + const len_t os = len % 8; + const hm_t * const ds = dts + OFFSET; + const uint32_t mul32 = (uint32_t)(dr[i]); + mulv = _mm256_set1_epi32(mul32); + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } + for (; j < len; j += 8) { + redv = _mm256_loadu_si256((__m256i*)(cfs+j)); + drv = _mm256_setr_epi64x( + dr[ds[j+1]], + dr[ds[j+3]], + dr[ds[j+5]], + dr[ds[j+7]]); + /* first four mult-adds -- lower */ + prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j+1]] = res[0]; + dr[ds[j+3]] = res[1]; + dr[ds[j+5]] = res[2]; + dr[ds[j+7]] = res[3]; + /* second four mult-adds -- higher */ + prodv = _mm256_mul_epu32(mulv, redv); + drv = _mm256_setr_epi64x( + dr[ds[j]], + dr[ds[j+2]], + dr[ds[j+4]], + dr[ds[j+6]]); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j]] = res[0]; + dr[ds[j+2]] = res[1]; + dr[ds[j+4]] = res[2]; + dr[ds[j+6]] = res[3]; + } + } else #endif + { + const len_t os = dts[PRELOOP]; + const hm_t * const ds = dts + OFFSET; + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } + for (; j < len; j += UNROLL) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j+1]] -= mul * cfs[j+1]; + dr[ds[j+2]] -= mul * cfs[j+2]; + dr[ds[j+3]] -= mul * cfs[j+3]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + dr[ds[j+1]] += (dr[ds[j+1]] >> 63) & mod2; + dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; + dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; + } + } dr[i] = 0; st->application_nr_mult += len / 1000.0; st->application_nr_add += len / 1000.0; @@ -976,10 +984,10 @@ static hm_t *reduce_dense_row_by_known_p int64_t np = -1; const int64_t mod = (int64_t)st->fc; const int64_t mod2 = (int64_t)st->fc * st->fc; -#ifdef HAVE_AVX2 +#ifdef __amd64__ int64_t res[4] __attribute__((aligned(32))); __m256i cmpv, redv, drv, mulv, prodv, resv, rresv; - __m256i zerov= _mm256_set1_epi64x(0); + __m256i zerov = _mm256_set1_epi64x(0); __m256i mod2v = _mm256_set1_epi64x(mod2); #endif @@ -1005,127 +1013,129 @@ static hm_t *reduce_dense_row_by_known_p dtsm = mulh[dts[MULT]]; cfsm = mulcf[dtsm[COEFFS]]; cfs = pivcf[dts[COEFFS]]; -#ifdef HAVE_AVX2 - const len_t len = dts[LENGTH]; - const len_t os = len % 8; - const hm_t * const ds = dts + OFFSET; - const uint32_t mul32 = (uint32_t)(dr[i]); - mulv = _mm256_set1_epi32(mul32); - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += 8) { - redv = _mm256_loadu_si256((__m256i*)(cfs+j)); - drv = _mm256_setr_epi64x( - dr[ds[j+1]], - dr[ds[j+3]], - dr[ds[j+5]], - dr[ds[j+7]]); - /* first four mult-adds -- lower */ - prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j+1]] = res[0]; - dr[ds[j+3]] = res[1]; - dr[ds[j+5]] = res[2]; - dr[ds[j+7]] = res[3]; - /* second four mult-adds -- higher */ 
- prodv = _mm256_mul_epu32(mulv, redv); - drv = _mm256_setr_epi64x( - dr[ds[j]], - dr[ds[j+2]], - dr[ds[j+4]], - dr[ds[j+6]]); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j]] = res[0]; - dr[ds[j+2]] = res[1]; - dr[ds[j+4]] = res[2]; - dr[ds[j+6]] = res[3]; - } - const len_t lenm = dtsm[LENGTH]; - const len_t osm = lenm % 8; - const hm_t * const dsm = dtsm + OFFSET; - /* const uint32_t mulm32 = (uint32_t)(drm[i]); */ - mulv = _mm256_set1_epi32(mul32); - for (j = 0; j < osm; ++j) { - drm[dsm[j]] -= mul * cfsm[j]; - drm[dsm[j]] += (drm[dsm[j]] >> 63) & mod2; - } - for (; j < lenm; j += 8) { - redv = _mm256_loadu_si256((__m256i*)(cfsm+j)); - drv = _mm256_setr_epi64x( - drm[dsm[j+1]], - drm[dsm[j+3]], - drm[dsm[j+5]], - drm[dsm[j+7]]); - /* first four mult-adds -- lower */ - prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - drm[dsm[j+1]] = res[0]; - drm[dsm[j+3]] = res[1]; - drm[dsm[j+5]] = res[2]; - drm[dsm[j+7]] = res[3]; - /* second four mult-adds -- higher */ - prodv = _mm256_mul_epu32(mulv, redv); - drv = _mm256_setr_epi64x( - drm[dsm[j]], - drm[dsm[j+2]], - drm[dsm[j+4]], - drm[dsm[j+6]]); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - drm[dsm[j]] = res[0]; - drm[dsm[j+2]] = res[1]; - drm[dsm[j+4]] = res[2]; - drm[dsm[j+6]] = res[3]; - } -#else - const len_t os = dts[PRELOOP]; - const len_t len = dts[LENGTH]; - const hm_t * const ds = dts + OFFSET; - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += UNROLL) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j+1]] -= mul * cfs[j+1]; - dr[ds[j+2]] -= mul * cfs[j+2]; - dr[ds[j+3]] -= mul * cfs[j+3]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - dr[ds[j+1]] += (dr[ds[j+1]] >> 63) & mod2; - dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; - dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; - } - const len_t osm = dtsm[PRELOOP]; - const len_t lenm = dtsm[LENGTH]; - const hm_t * const dsm = dtsm + OFFSET; - for (j = 0; j < osm; ++j) { - drm[dsm[j]] -= mul * cfsm[j]; - drm[dsm[j]] += (drm[dsm[j]] >> 63) & mod2; - } - for (; j < lenm; j += UNROLL) { - drm[dsm[j]] -= mul * cfsm[j]; - drm[dsm[j+1]] -= mul * cfsm[j+1]; - drm[dsm[j+2]] -= mul * cfsm[j+2]; - drm[dsm[j+3]] -= mul * cfsm[j+3]; - drm[dsm[j]] += (drm[dsm[j]] >> 63) & mod2; - drm[dsm[j+1]] += (drm[dsm[j+1]] >> 63) & mod2; - drm[dsm[j+2]] += (drm[dsm[j+2]] >> 63) & mod2; - drm[dsm[j+3]] += (drm[dsm[j+3]] >> 63) & mod2; - } + const len_t len = dts[LENGTH]; +#ifdef __amd64__ + if (have_avx2) { + const len_t os = len % 8; + const hm_t * const ds = dts + OFFSET; + const uint32_t mul32 = (uint32_t)(dr[i]); + mulv = _mm256_set1_epi32(mul32); + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } + for (; j < len; j += 8) { + redv = _mm256_loadu_si256((__m256i*)(cfs+j)); + drv = _mm256_setr_epi64x( + dr[ds[j+1]], + dr[ds[j+3]], + dr[ds[j+5]], + dr[ds[j+7]]); + /* first four mult-adds -- lower */ + prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = 
_mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j+1]] = res[0]; + dr[ds[j+3]] = res[1]; + dr[ds[j+5]] = res[2]; + dr[ds[j+7]] = res[3]; + /* second four mult-adds -- higher */ + prodv = _mm256_mul_epu32(mulv, redv); + drv = _mm256_setr_epi64x( + dr[ds[j]], + dr[ds[j+2]], + dr[ds[j+4]], + dr[ds[j+6]]); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j]] = res[0]; + dr[ds[j+2]] = res[1]; + dr[ds[j+4]] = res[2]; + dr[ds[j+6]] = res[3]; + } + const len_t lenm = dtsm[LENGTH]; + const len_t osm = lenm % 8; + const hm_t * const dsm = dtsm + OFFSET; + /* const uint32_t mulm32 = (uint32_t)(drm[i]); */ + mulv = _mm256_set1_epi32(mul32); + for (j = 0; j < osm; ++j) { + drm[dsm[j]] -= mul * cfsm[j]; + drm[dsm[j]] += (drm[dsm[j]] >> 63) & mod2; + } + for (; j < lenm; j += 8) { + redv = _mm256_loadu_si256((__m256i*)(cfsm+j)); + drv = _mm256_setr_epi64x( + drm[dsm[j+1]], + drm[dsm[j+3]], + drm[dsm[j+5]], + drm[dsm[j+7]]); + /* first four mult-adds -- lower */ + prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + drm[dsm[j+1]] = res[0]; + drm[dsm[j+3]] = res[1]; + drm[dsm[j+5]] = res[2]; + drm[dsm[j+7]] = res[3]; + /* second four mult-adds -- higher */ + prodv = _mm256_mul_epu32(mulv, redv); + drv = _mm256_setr_epi64x( + drm[dsm[j]], + drm[dsm[j+2]], + drm[dsm[j+4]], + drm[dsm[j+6]]); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + drm[dsm[j]] = res[0]; + drm[dsm[j+2]] = res[1]; + drm[dsm[j+4]] = res[2]; + drm[dsm[j+6]] = res[3]; + } + } else #endif + { + const len_t os = dts[PRELOOP]; + const hm_t * const ds = dts + OFFSET; + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } + for (; j < len; j += UNROLL) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j+1]] -= mul * cfs[j+1]; + dr[ds[j+2]] -= mul * cfs[j+2]; + dr[ds[j+3]] -= mul * cfs[j+3]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + dr[ds[j+1]] += (dr[ds[j+1]] >> 63) & mod2; + dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; + dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; + } + const len_t osm = dtsm[PRELOOP]; + const len_t lenm = dtsm[LENGTH]; + const hm_t * const dsm = dtsm + OFFSET; + for (j = 0; j < osm; ++j) { + drm[dsm[j]] -= mul * cfsm[j]; + drm[dsm[j]] += (drm[dsm[j]] >> 63) & mod2; + } + for (; j < lenm; j += UNROLL) { + drm[dsm[j]] -= mul * cfsm[j]; + drm[dsm[j+1]] -= mul * cfsm[j+1]; + drm[dsm[j+2]] -= mul * cfsm[j+2]; + drm[dsm[j+3]] -= mul * cfsm[j+3]; + drm[dsm[j]] += (drm[dsm[j]] >> 63) & mod2; + drm[dsm[j+1]] += (drm[dsm[j+1]] >> 63) & mod2; + drm[dsm[j+2]] += (drm[dsm[j+2]] >> 63) & mod2; + drm[dsm[j+3]] += (drm[dsm[j+3]] >> 63) & mod2; + } + } dr[i] = 0; st->application_nr_mult += len / 1000.0; st->application_nr_add += len / 1000.0; @@ -1211,7 +1221,7 @@ static hm_t *trace_reduce_dense_row_by_k const len_t ncols = mat->nc; const len_t ncl = mat->ncl; cf32_t * const * const mcf = mat->cf_32; -#ifdef HAVE_AVX2 +#ifdef __amd64__ int64_t res[4] __attribute__((aligned(32))); __m256i cmpv, redv, drv, mulv, prodv, 
resv, rresv; __m256i zerov = _mm256_set1_epi64x(0); @@ -1245,68 +1255,70 @@ static hm_t *trace_reduce_dense_row_by_k cfs = mcf[dts[COEFFS]]; } -#ifdef HAVE_AVX2 - const len_t len = dts[LENGTH]; - const len_t os = len % 8; - const hm_t * const ds = dts + OFFSET; - const uint32_t mul32 = (uint32_t)(dr[i]); - mulv = _mm256_set1_epi32(mul32); - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += 8) { - redv = _mm256_loadu_si256((__m256i*)(cfs+j)); - drv = _mm256_setr_epi64x( - dr[ds[j+1]], - dr[ds[j+3]], - dr[ds[j+5]], - dr[ds[j+7]]); - /* first four mult-adds -- lower */ - prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j+1]] = res[0]; - dr[ds[j+3]] = res[1]; - dr[ds[j+5]] = res[2]; - dr[ds[j+7]] = res[3]; - /* second four mult-adds -- higher */ - prodv = _mm256_mul_epu32(mulv, redv); - drv = _mm256_setr_epi64x( - dr[ds[j]], - dr[ds[j+2]], - dr[ds[j+4]], - dr[ds[j+6]]); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j]] = res[0]; - dr[ds[j+2]] = res[1]; - dr[ds[j+4]] = res[2]; - dr[ds[j+6]] = res[3]; - } -#else - const len_t os = dts[PRELOOP]; - const len_t len = dts[LENGTH]; - const hm_t * const ds = dts + OFFSET; - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += UNROLL) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j+1]] -= mul * cfs[j+1]; - dr[ds[j+2]] -= mul * cfs[j+2]; - dr[ds[j+3]] -= mul * cfs[j+3]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - dr[ds[j+1]] += (dr[ds[j+1]] >> 63) & mod2; - dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; - dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; - } + const len_t len = dts[LENGTH]; +#ifdef __amd64__ + if (have_avx2) { + const len_t os = len % 8; + const hm_t * const ds = dts + OFFSET; + const uint32_t mul32 = (uint32_t)(dr[i]); + mulv = _mm256_set1_epi32(mul32); + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } + for (; j < len; j += 8) { + redv = _mm256_loadu_si256((__m256i*)(cfs+j)); + drv = _mm256_setr_epi64x( + dr[ds[j+1]], + dr[ds[j+3]], + dr[ds[j+5]], + dr[ds[j+7]]); + /* first four mult-adds -- lower */ + prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j+1]] = res[0]; + dr[ds[j+3]] = res[1]; + dr[ds[j+5]] = res[2]; + dr[ds[j+7]] = res[3]; + /* second four mult-adds -- higher */ + prodv = _mm256_mul_epu32(mulv, redv); + drv = _mm256_setr_epi64x( + dr[ds[j]], + dr[ds[j+2]], + dr[ds[j+4]], + dr[ds[j+6]]); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j]] = res[0]; + dr[ds[j+2]] = res[1]; + dr[ds[j+4]] = res[2]; + dr[ds[j+6]] = res[3]; + } + } else #endif + { + const len_t os = dts[PRELOOP]; + const hm_t * const ds = dts + OFFSET; + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } + for (; j < 
len; j += UNROLL) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j+1]] -= mul * cfs[j+1]; + dr[ds[j+2]] -= mul * cfs[j+2]; + dr[ds[j+3]] -= mul * cfs[j+3]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + dr[ds[j+1]] += (dr[ds[j+1]] >> 63) & mod2; + dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; + dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; + } + } dr[i] = 0; st->trace_nr_mult += len / 1000.0; st->trace_nr_add += len / 1000.0; --- msolve-0.5.0/src/neogb/symbol.c.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/src/neogb/symbol.c 2023-07-28 11:16:27.465530091 -0600 @@ -21,10 +21,6 @@ #include "data.h" -#ifdef HAVE_AVX2 -#include <immintrin.h> -#endif - /* select_all_pairs() is unused at the moment */ #if 0 static void select_all_spairs(
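
For reference, every per-file change above hinges on the have_avx2 flag that src/neogb/data.c now sets in a constructor before main() runs. Below is a minimal standalone sketch of that detection idiom, not part of the patch: it assumes GCC or clang on x86-64 with <cpuid.h> available, and it uses __get_cpuid_count(7, 0, ...) so the CPUID leaf-7 subleaf in ECX is explicitly zero (the patch itself calls __get_cpuid(7, ...)); bit_AVX2 is the EBX mask provided by <cpuid.h>.

/* Standalone sketch of the runtime AVX2 check (assumption: GCC/clang,
 * <cpuid.h> providing __get_cpuid_count and bit_AVX2; not part of the patch). */
#include <stdio.h>

#ifdef __amd64__
#include <cpuid.h>

static _Bool have_avx2;

/* Runs before main(), like the constructor added in data.c. */
static void __attribute__((constructor)) set_avx2_flag(void)
{
    unsigned int eax, ebx, ecx, edx;
    have_avx2 = __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)
                && (ebx & bit_AVX2) != 0;
}
#endif

int main(void)
{
#ifdef __amd64__
    printf("AVX2 available at runtime: %s\n", have_avx2 ? "yes" : "no");
#else
    printf("non-x86-64 build: the AVX2 paths are compiled out\n");
#endif
    return 0;
}

Compiled on its own (for example with cc check_avx2.c, file name hypothetical), this prints whether the branches guarded by if (have_avx2) in the files above would be taken on the current machine; the patched library makes the same decision once at load time instead of at configure time.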