--- msolve-0.5.0/m4/ax_ext.m4.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/m4/ax_ext.m4 2023-07-28 10:50:28.215653884 -0600 @@ -55,274 +55,12 @@ AC_DEFUN([AX_EXT], SIMD_FLAGS="" case $host_cpu in - powerpc*) - AC_CACHE_CHECK([whether altivec is supported for old distros], [ax_cv_have_altivec_old_ext], - [ - if test `/usr/sbin/sysctl -a 2>/dev/null| grep -c hw.optional.altivec` != 0; then - if test `/usr/sbin/sysctl -n hw.optional.altivec` = 1; then - ax_cv_have_altivec_old_ext=yes - fi - fi - ]) - - if test "$ax_cv_have_altivec_old_ext" = yes; then - AC_DEFINE(HAVE_ALTIVEC,,[Support Altivec instructions]) - AX_CHECK_COMPILE_FLAG(-faltivec, SIMD_FLAGS="$SIMD_FLAGS -faltivec", []) - fi - - AC_CACHE_CHECK([whether altivec is supported], [ax_cv_have_altivec_ext], - [ - if test `LD_SHOW_AUXV=1 /bin/true 2>/dev/null|grep -c altivec` != 0; then - ax_cv_have_altivec_ext=yes - fi - ]) - - if test "$ax_cv_have_altivec_ext" = yes; then - AC_DEFINE(HAVE_ALTIVEC,,[Support Altivec instructions]) - AX_CHECK_COMPILE_FLAG(-maltivec, SIMD_FLAGS="$SIMD_FLAGS -maltivec", []) - fi - - AC_CACHE_CHECK([whether vsx is supported], [ax_cv_have_vsx_ext], - [ - if test `LD_SHOW_AUXV=1 /bin/true 2>/dev/null|grep -c vsx` != 0; then - ax_cv_have_vsx_ext=yes - fi - ]) - - if test "$ax_cv_have_vsx_ext" = yes; then - AC_DEFINE(HAVE_VSX,,[Support VSX instructions]) - AX_CHECK_COMPILE_FLAG(-mvsx, SIMD_FLAGS="$SIMD_FLAGS -mvsx", []) - fi - ;; - i[[3456]]86*|x86_64*|amd64*) - AC_REQUIRE([AX_GCC_X86_CPUID]) - AC_REQUIRE([AX_GCC_X86_CPUID_COUNT]) - AC_REQUIRE([AX_GCC_X86_AVX_XGETBV]) - - eax_cpuid0=0 - AX_GCC_X86_CPUID(0x00000000) - if test "$ax_cv_gcc_x86_cpuid_0x00000000" != "unknown"; - then - eax_cpuid0=`echo $ax_cv_gcc_x86_cpuid_0x00000000 | cut -d ":" -f 1` - fi - - eax_cpuid80000000=0 - AX_GCC_X86_CPUID(0x80000000) - if test "$ax_cv_gcc_x86_cpuid_0x80000000" != "unknown"; - then - eax_cpuid80000000=`echo $ax_cv_gcc_x86_cpuid_0x80000000 | cut -d ":" -f 1` - fi - - ecx_cpuid1=0 - edx_cpuid1=0 - if test "$((0x$eax_cpuid0))" -ge 1 ; then - AX_GCC_X86_CPUID(0x00000001) - if test "$ax_cv_gcc_x86_cpuid_0x00000001" != "unknown"; - then - ecx_cpuid1=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 3` - edx_cpuid1=`echo $ax_cv_gcc_x86_cpuid_0x00000001 | cut -d ":" -f 4` - fi - fi - - ebx_cpuid7=0 - ecx_cpuid7=0 - if test "$((0x$eax_cpuid0))" -ge 7 ; then - AX_GCC_X86_CPUID_COUNT(0x00000007, 0x00) - if test "$ax_cv_gcc_x86_cpuid_0x00000007" != "unknown"; - then - ebx_cpuid7=`echo $ax_cv_gcc_x86_cpuid_0x00000007 | cut -d ":" -f 2` - ecx_cpuid7=`echo $ax_cv_gcc_x86_cpuid_0x00000007 | cut -d ":" -f 3` - fi - fi - - ecx_cpuid80000001=0 - edx_cpuid80000001=0 - if test "$((0x$eax_cpuid80000000))" -ge "$((0x80000001))" ; then - AX_GCC_X86_CPUID(0x80000001) - if test "$ax_cv_gcc_x86_cpuid_0x80000001" != "unknown"; - then - ecx_cpuid80000001=`echo $ax_cv_gcc_x86_cpuid_0x80000001 | cut -d ":" -f 3` - edx_cpuid80000001=`echo $ax_cv_gcc_x86_cpuid_0x80000001 | cut -d ":" -f 4` - fi - fi - - AC_CACHE_VAL([ax_cv_have_mmx_os_support_ext], - [ - ax_cv_have_mmx_os_support_ext=yes - ]) - - ax_cv_have_none_os_support_ext=yes - - AC_CACHE_VAL([ax_cv_have_sse_os_support_ext], - [ - ax_cv_have_sse_os_support_ext=no, - if test "$((0x$edx_cpuid1>>25&0x01))" = 1; then - AC_LANG_PUSH([C]) - AC_RUN_IFELSE([AC_LANG_SOURCE([[ -#include -#include - /* No way at ring1 to ring3 in protected mode to check the CR0 and CR4 - control registers directly. Execute an SSE instruction. 
- If it raises SIGILL then OS doesn't support SSE based instructions */ - void sig_handler(int signum){ exit(1); } - int main(){ - signal(SIGILL, sig_handler); - /* SSE instruction xorps %xmm0,%xmm0 */ - __asm__ __volatile__ (".byte 0x0f, 0x57, 0xc0"); - return 0; - }]])], - [ax_cv_have_sse_os_support_ext=yes], - [ax_cv_have_sse_os_support_ext=no], - [ax_cv_have_sse_os_support_ext=no]) - AC_LANG_POP([C]) - fi - ]) - - xgetbv_eax=0 - if test "$((0x$ecx_cpuid1>>28&0x01))" = 1; then - AX_GCC_X86_AVX_XGETBV(0x00000000) - - if test x"$ax_cv_gcc_x86_avx_xgetbv_0x00000000" != x"unknown"; then - xgetbv_eax=`echo $ax_cv_gcc_x86_avx_xgetbv_0x00000000 | cut -d ":" -f 1` - fi - - AC_CACHE_VAL([ax_cv_have_avx_os_support_ext], - [ - ax_cv_have_avx_os_support_ext=no - if test "$((0x$ecx_cpuid1>>27&0x01))" = 1; then - if test "$((0x$xgetbv_eax&0x6))" = 6; then - ax_cv_have_avx_os_support_ext=yes - fi - fi - ]) - fi - - AC_CACHE_VAL([ax_cv_have_avx512_os_support_ext], - [ - ax_cv_have_avx512_os_support_ext=no - if test "$ax_cv_have_avx_os_support_ext" = yes; then - if test "$((0x$xgetbv_eax&0xe6))" = "$((0xe6))"; then - ax_cv_have_avx512_os_support_ext=yes - fi - fi - ]) - - for ac_instr_info dnl - in "none;rdrnd;RDRND;ecx_cpuid1,30;-mrdrnd;HAVE_RDRND;CPUEXT_FLAGS" dnl - "none;bmi1;BMI1;ebx_cpuid7,3;-mbmi;HAVE_BMI1;CPUEXT_FLAGS" dnl - "none;bmi2;BMI2;ebx_cpuid7,8;-mbmi2;HAVE_BMI2;CPUEXT_FLAGS" dnl - "none;adx;ADX;ebx_cpuid7,19;-madx;HAVE_ADX;CPUEXT_FLAGS" dnl - dnl "none;mpx;MPX;ebx_cpuid7,14;-mmpx;HAVE_MPX;CPUEXT_FLAGS" dnl - "none;prefetchwt1;PREFETCHWT1;ecx_cpuid7,0;-mprefetchwt1;HAVE_PREFETCHWT1;CPUEXT_FLAGS" dnl - "none;abm;ABM;ecx_cpuid80000001,5;-mabm;HAVE_ABM;CPUEXT_FLAGS" dnl - "mmx;mmx;MMX;edx_cpuid1,23;-mmmx;HAVE_MMX;SIMD_FLAGS" dnl - "sse;sse;SSE;edx_cpuid1,25;-msse;HAVE_SSE;SIMD_FLAGS" dnl - "sse;sse2;SSE2;edx_cpuid1,26;-msse2;HAVE_SSE2;SIMD_FLAGS" dnl - "sse;sse3;SSE3;ecx_cpuid1,1;-msse3;HAVE_SSE3;SIMD_FLAGS" dnl - "sse;ssse3;SSSE3;ecx_cpuid1,9;-mssse3;HAVE_SSSE3;SIMD_FLAGS" dnl - "sse;sse41;SSE4.1;ecx_cpuid1,19;-msse4.1;HAVE_SSE4_1;SIMD_FLAGS" dnl - "sse;sse42;SSE4.2;ecx_cpuid1,20;-msse4.2;HAVE_SSE4_2;SIMD_FLAGS" dnl - "sse;sse4a;SSE4a;ecx_cpuid80000001,6;-msse4a;HAVE_SSE4a;SIMD_FLAGS" dnl - "sse;sha;SHA;ebx_cpuid7,29;-msha;HAVE_SHA;SIMD_FLAGS" dnl - "sse;aes;AES;ecx_cpuid1,25;-maes;HAVE_AES;SIMD_FLAGS" dnl - "avx;avx;AVX;ecx_cpuid1,28;-mavx;HAVE_AVX;SIMD_FLAGS" dnl - "avx;fma3;FMA3;ecx_cpuid1,12;-mfma;HAVE_FMA3;SIMD_FLAGS" dnl - "avx;fma4;FMA4;ecx_cpuid80000001,16;-mfma4;HAVE_FMA4;SIMD_FLAGS" dnl - "avx;xop;XOP;ecx_cpuid80000001,11;-mxop;HAVE_XOP;SIMD_FLAGS" dnl - "avx;avx2;AVX2;ebx_cpuid7,5;-mavx2;HAVE_AVX2;SIMD_FLAGS" dnl - "avx512;avx512f;AVX512-F;ebx_cpuid7,16;-mavx512f;HAVE_AVX512_F;SIMD_FLAGS" dnl - "avx512;avx512cd;AVX512-CD;ebx_cpuid7,28;-mavx512cd;HAVE_AVX512_CD;SIMD_FLAGS" dnl - "avx512;avx512pf;AVX512-PF;ebx_cpuid7,26;-mavx512pf;HAVE_AVX512_PF;SIMD_FLAGS" dnl - "avx512;avx512er;AVX512-ER;ebx_cpuid7,27;-mavx512er;HAVE_AVX512_ER;SIMD_FLAGS" dnl - "avx512;avx512vl;AVX512-VL;ebx_cpuid7,31;-mavx512vl;HAVE_AVX512_VL;SIMD_FLAGS" dnl - "avx512;avx512bw;AVX512-BW;ebx_cpuid7,30;-mavx512bw;HAVE_AVX512_BW;SIMD_FLAGS" dnl - "avx512;avx512dq;AVX512-DQ;ebx_cpuid7,17;-mavx512dq;HAVE_AVX512_DQ;SIMD_FLAGS" dnl - "avx512;avx512ifma;AVX512-IFMA;ebx_cpuid7,21;-mavx512ifma;HAVE_AVX512_IFMA;SIMD_FLAGS" dnl - "avx512;avx512vbmi;AVX512-VBMI;ecx_cpuid7,1;-mavx512vbmi;HAVE_AVX512_VBMI;SIMD_FLAGS" dnl - # - do ac_instr_os_support=$(eval echo \$ax_cv_have_$(echo $ac_instr_info | cut -d ";" -f 
1)_os_support_ext) - ac_instr_acvar=$(echo $ac_instr_info | cut -d ";" -f 2) - ac_instr_shortname=$(echo $ac_instr_info | cut -d ";" -f 3) - ac_instr_chk_loc=$(echo $ac_instr_info | cut -d ";" -f 4) - ac_instr_chk_reg=0x$(eval echo \$$(echo $ac_instr_chk_loc | cut -d "," -f 1)) - ac_instr_chk_bit=$(echo $ac_instr_chk_loc | cut -d "," -f 2) - ac_instr_compiler_flags=$(echo $ac_instr_info | cut -d ";" -f 5) - ac_instr_have_define=$(echo $ac_instr_info | cut -d ";" -f 6) - ac_instr_flag_type=$(echo $ac_instr_info | cut -d ";" -f 7) - - AC_CACHE_CHECK([whether ${ac_instr_shortname} is supported by the processor], [ax_cv_have_${ac_instr_acvar}_cpu_ext], - [ - eval ax_cv_have_${ac_instr_acvar}_cpu_ext=no - if test "$((${ac_instr_chk_reg}>>${ac_instr_chk_bit}&0x01))" = 1 ; then - eval ax_cv_have_${ac_instr_acvar}_cpu_ext=yes - fi - ]) - - if test x"$(eval echo \$ax_cv_have_${ac_instr_acvar}_cpu_ext)" = x"yes"; then - AC_CACHE_CHECK([whether ${ac_instr_shortname} is supported by the processor and OS], [ax_cv_have_${ac_instr_acvar}_ext], - [ - eval ax_cv_have_${ac_instr_acvar}_ext=no - if test x"${ac_instr_os_support}" = x"yes"; then - eval ax_cv_have_${ac_instr_acvar}_ext=yes - fi - ]) - - if test "$(eval echo \$ax_cv_have_${ac_instr_acvar}_ext)" = yes; then - AX_CHECK_COMPILE_FLAG(${ac_instr_compiler_flags}, eval ax_cv_support_${ac_instr_acvar}_ext=yes, - eval ax_cv_support_${ac_instr_acvar}_ext=no) - if test x"$(eval echo \$ax_cv_support_${ac_instr_acvar}_ext)" = x"yes"; then - eval ${ac_instr_flag_type}=\"\$${ac_instr_flag_type} ${ac_instr_compiler_flags}\" - AC_DEFINE_UNQUOTED([${ac_instr_have_define}]) - else - AC_MSG_WARN([Your processor and OS supports ${ac_instr_shortname} instructions but not your compiler, can you try another compiler?]) - fi - else - if test x"${ac_instr_os_support}" = x"no"; then - AC_CACHE_VAL(ax_cv_support_${ac_instr_acvar}_ext, eval ax_cv_support_${ac_instr_acvar}_ext=no) - AC_MSG_WARN([Your processor supports ${ac_instr_shortname}, but your OS doesn't]) - fi - fi - else - AC_CACHE_VAL(ax_cv_have_${ac_instr_acvar}_ext, eval ax_cv_have_${ac_instr_acvar}_ext=no) - AC_CACHE_VAL(ax_cv_support_${ac_instr_acvar}_ext, eval ax_cv_support_${ac_instr_acvar}_ext=no) - fi - done + SIMD_FLAGS="-mavx2" ;; esac - AH_TEMPLATE([HAVE_RDRND],[Define to 1 to support Digital Random Number Generator]) - AH_TEMPLATE([HAVE_BMI1],[Define to 1 to support Bit Manipulation Instruction Set 1]) - AH_TEMPLATE([HAVE_BMI2],[Define to 1 to support Bit Manipulation Instruction Set 2]) - AH_TEMPLATE([HAVE_ADX],[Define to 1 to support Multi-Precision Add-Carry Instruction Extensions]) - AH_TEMPLATE([HAVE_MPX],[Define to 1 to support Memory Protection Extensions]) - AH_TEMPLATE([HAVE_PREFETCHWT1],[Define to 1 to support Prefetch Vector Data Into Caches WT1]) - AH_TEMPLATE([HAVE_ABM],[Define to 1 to support Advanced Bit Manipulation]) - AH_TEMPLATE([HAVE_MMX],[Define to 1 to support Multimedia Extensions]) - AH_TEMPLATE([HAVE_SSE],[Define to 1 to support Streaming SIMD Extensions]) - AH_TEMPLATE([HAVE_SSE2],[Define to 1 to support Streaming SIMD Extensions]) - AH_TEMPLATE([HAVE_SSE3],[Define to 1 to support Streaming SIMD Extensions 3]) - AH_TEMPLATE([HAVE_SSSE3],[Define to 1 to support Supplemental Streaming SIMD Extensions 3]) - AH_TEMPLATE([HAVE_SSE4_1],[Define to 1 to support Streaming SIMD Extensions 4.1]) - AH_TEMPLATE([HAVE_SSE4_2],[Define to 1 to support Streaming SIMD Extensions 4.2]) - AH_TEMPLATE([HAVE_SSE4a],[Define to 1 to support AMD Streaming SIMD Extensions 4a]) - 
AH_TEMPLATE([HAVE_SHA],[Define to 1 to support Secure Hash Algorithm Extension]) - AH_TEMPLATE([HAVE_AES],[Define to 1 to support Advanced Encryption Standard New Instruction Set (AES-NI)]) - AH_TEMPLATE([HAVE_AVX],[Define to 1 to support Advanced Vector Extensions]) - AH_TEMPLATE([HAVE_FMA3],[Define to 1 to support Fused Multiply-Add Extensions 3]) - AH_TEMPLATE([HAVE_FMA4],[Define to 1 to support Fused Multiply-Add Extensions 4]) - AH_TEMPLATE([HAVE_XOP],[Define to 1 to support eXtended Operations Extensions]) - AH_TEMPLATE([HAVE_AVX2],[Define to 1 to support Advanced Vector Extensions 2]) - AH_TEMPLATE([HAVE_AVX512_F],[Define to 1 to support AVX-512 Foundation Extensions]) - AH_TEMPLATE([HAVE_AVX512_CD],[Define to 1 to support AVX-512 Conflict Detection Instructions]) - AH_TEMPLATE([HAVE_AVX512_PF],[Define to 1 to support AVX-512 Conflict Prefetch Instructions]) - AH_TEMPLATE([HAVE_AVX512_ER],[Define to 1 to support AVX-512 Exponential & Reciprocal Instructions]) - AH_TEMPLATE([HAVE_AVX512_VL],[Define to 1 to support AVX-512 Vector Length Extensions]) - AH_TEMPLATE([HAVE_AVX512_BW],[Define to 1 to support AVX-512 Byte and Word Instructions]) - AH_TEMPLATE([HAVE_AVX512_DQ],[Define to 1 to support AVX-512 Doubleword and Quadword Instructions]) - AH_TEMPLATE([HAVE_AVX512_IFMA],[Define to 1 to support AVX-512 Integer Fused Multiply Add Instructions]) - AH_TEMPLATE([HAVE_AVX512_VBMI],[Define to 1 to support AVX-512 Vector Byte Manipulation Instructions]) AC_SUBST(SIMD_FLAGS) AC_SUBST(CPUEXT_FLAGS) ]) --- msolve-0.5.0/src/fglm/fglm_core.c.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/src/fglm/fglm_core.c 2023-07-28 14:08:11.775979716 -0600 @@ -41,6 +41,8 @@ double omp_get_wtime(void) { return real #define DEBUGFGLM 0 #define BLOCKWIED 0 +extern _Bool have_avx2; + #include "data_fglm.c" #include "linalg-fglm.c" #include "matrix-mult.c" @@ -449,15 +451,16 @@ static inline void sparse_mat_fglm_mult_ res[mat->triv_idx[i]] = vec[mat->triv_pos[i]]; } /* printf("ncols %u\n", ncols); */ -#ifdef HAVE_AVX2 - /* matrix_vector_product(vres, mat->dense_mat, vec, ncols, nrows, prime, RED_32, RED_64); */ - _8mul_matrix_vector_product(vres, mat->dense_mat, vec, mat->dst, - ncols, nrows, prime, RED_32, RED_64, - preinv,st); -#else - non_avx_matrix_vector_product(vres, mat->dense_mat, vec, - ncols, nrows, prime, RED_32, RED_64,st); +#ifdef __amd64__ + if (have_avx2) + /* matrix_vector_product(vres, mat->dense_mat, vec, ncols, nrows, prime, RED_32, RED_64); */ + _8mul_matrix_vector_product(vres, mat->dense_mat, vec, mat->dst, + ncols, nrows, prime, RED_32, RED_64, + preinv,st); + else #endif + non_avx_matrix_vector_product(vres, mat->dense_mat, vec, + ncols, nrows, prime, RED_32, RED_64,st); for(szmat_t i = 0; i < nrows; i++){ res[mat->dense_idx[i]] = vres[i]; } @@ -498,17 +501,18 @@ static inline void sparse_mat_fglm_colon } /* printf ("zero\n"); */ /* printf("ncols %u\n", ncols); */ -#ifdef HAVE_AVX2 - /* matrix_vector_product(vres, mat->dense_mat, vec, ncols, nrows, prime, RED_32, RED_64); */ - _8mul_matrix_vector_product(vres, mat->dense_mat, vec, mat->dst, - ncols, nrows, prime, RED_32, RED_64, - preinv,st); +#ifdef __amd64__ + if (have_avx2) + /* matrix_vector_product(vres, mat->dense_mat, vec, ncols, nrows, prime, RED_32, RED_64); */ + _8mul_matrix_vector_product(vres, mat->dense_mat, vec, mat->dst, + ncols, nrows, prime, RED_32, RED_64, + preinv,st); /* printf ("mul AVX\n"); */ -#else - non_avx_matrix_vector_product(vres, mat->dense_mat, vec, - ncols, nrows, prime, RED_32, RED_64,st); - /* 
printf ("mul non AVX\n"); */ + else #endif + non_avx_matrix_vector_product(vres, mat->dense_mat, vec, + ncols, nrows, prime, RED_32, RED_64,st); + /* printf ("mul non AVX\n"); */ for(szmat_t i = 0; i < nrows; i++){ res[mat->dense_idx[i]] = vres[i]; } --- msolve-0.5.0/src/fglm/inner-product.c.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/src/fglm/inner-product.c 2023-07-27 17:14:03.128200166 -0600 @@ -22,7 +22,7 @@ #include #include -#ifdef HAVE_AVX2 +#ifdef __amd64__ #include #define AVX2LOAD(A) _mm256_load_si256((__m256i*)(A)) @@ -257,7 +257,7 @@ static inline void non_avx_matrix_vector } } -#ifdef HAVE_AVX2 +#ifdef __amd64__ static inline void matrix_vector_product(uint32_t* vec_res, const uint32_t* mat, const uint32_t* vec, const uint32_t ncols, const uint32_t nrows, const uint32_t PRIME, --- msolve-0.5.0/src/fglm/linalg-fglm.c.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/src/fglm/linalg-fglm.c 2023-07-27 17:14:28.342887244 -0600 @@ -22,7 +22,7 @@ #include #include -#ifdef HAVE_AVX2 +#ifdef __amd64__ #include #define AVX2LOAD(A) _mm256_load_si256((__m256i*)(A)) @@ -230,7 +230,7 @@ static inline void non_avx_matrix_vector } } -#ifdef HAVE_AVX2 +#ifdef __amd64__ static inline void matrix_vector_product(uint32_t* vec_res, const uint32_t* mat, const uint32_t* vec, const uint32_t ncols, const uint32_t nrows, const uint32_t PRIME, --- msolve-0.5.0/src/fglm/matrix-mult.c.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/src/fglm/matrix-mult.c 2023-07-27 17:16:17.950526960 -0600 @@ -21,7 +21,7 @@ //-1 sur 32 bits #define MONE32 ((uint32_t)0xFFFFFFFF) -#ifdef HAVE_AVX2 +#ifdef __amd64__ #include #define AVX2LOAD(A) _mm256_load_si256((__m256i*)(A)) @@ -110,7 +110,7 @@ static inline uint64_t SUBMODRED32(uint6 return (a < b) ? 
(p + neg) : (neg); } -#if HAVE_AVX2 +#if __amd64__ static inline void REDUCE(uint64_t *acc64, uint64_t *acc4x64, __m256i acc_low, __m256i acc_high, const uint32_t fc, const uint32_t preinv, @@ -165,7 +165,7 @@ static __inline__ void make_zero(uint32_ } -#ifdef HAVE_AVX2 +#ifdef __amd64__ /* m = number of rows in A l = number of cols in A = number of rows in B @@ -283,15 +283,18 @@ static inline void sparse_matfglm_mul(CF /* real product */ -#ifdef HAVE_AVX2 - _mod_mat_addmul_transpose_op(tres, matxn->dense_mat, R, - matxn->nrows, matxn->ncols, nc, - prime, preinv, - RED_32, RED_64); -#else - fprintf(stderr, "Not implemented yet\n"); - exit(1); +#ifdef __amd64__ + if (have_avx2) + _mod_mat_addmul_transpose_op(tres, matxn->dense_mat, R, + matxn->nrows, matxn->ncols, nc, + prime, preinv, + RED_32, RED_64); + else #endif + { + fprintf(stderr, "Not implemented yet\n"); + exit(1); + } for(szmat_t j = 0; j < nrows; j++){ for(int i = 0; i < nc; i++){ --- msolve-0.5.0/src/neogb/data.c.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/src/neogb/data.c 2023-07-28 11:18:11.344065251 -0600 @@ -21,6 +21,19 @@ #include "data.h" +/* check for AVX2 availability */ +#ifdef __amd64__ +#include <cpuid.h> +_Bool have_avx2; + +static void __attribute__((constructor)) +set_avx2_flag(void) +{ + unsigned int eax, ebx, ecx, edx; + have_avx2 = __get_cpuid(7, &eax, &ebx, &ecx, &edx) && (ebx & bit_AVX2) != 0; +} +#endif + /* function pointers */ /* bs_t *(*initialize_basis)( * const int32_t ngens --- msolve-0.5.0/src/neogb/data.h.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/src/neogb/data.h 2023-07-28 11:17:26.592696306 -0600 @@ -40,6 +40,11 @@ inline omp_int_t omp_get_thread_num(void inline omp_int_t omp_get_max_threads(void) { return 1;} #endif +/* check if AVX2 is available */ +#ifdef __amd64__ +extern _Bool have_avx2; +#endif + #define PARALLEL_HASHING 1 #define ORDER_COLUMNS 1 /* loop unrolling in sparse linear algebra: --- msolve-0.5.0/src/neogb/la_ff_16.c.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/src/neogb/la_ff_16.c 2023-07-28 11:18:23.967887228 -0600 @@ -20,7 +20,7 @@ #include "data.h" -#ifdef HAVE_AVX2 +#ifdef __amd64__ #include <immintrin.h> #endif @@ -122,7 +122,7 @@ static hm_t *reduce_dense_row_by_known_p const len_t ncl = mat->ncl; cf16_t * const * const mcf = mat->cf_16; -#ifdef HAVE_AVX2 +#ifdef __amd64__ uint32_t mone32 = (uint32_t)0xFFFFFFFF; uint16_t mone16 = (uint16_t)0xFFFF; uint32_t mone16h = (uint32_t)0xFFFF0000; @@ -157,82 +157,85 @@ static hm_t *reduce_dense_row_by_known_p } else { cfs = mcf[dts[COEFFS]]; } -#ifdef HAVE_AVX2 - const uint16_t mul16 = (uint16_t)(fc - dr[i]); - mulv = _mm256_set1_epi16(mul16); const len_t len = dts[LENGTH]; - const len_t os = len % 16; - const hm_t * const ds = dts + OFFSET; - for (j = 0; j < os; ++j) { - dr[ds[j]] += mul * cfs[j]; - } - for (; j < len; j += 16) { - redv = _mm256_loadu_si256((__m256i*)(cfs+j)); - prodh = _mm256_mulhi_epu16(mulv, redv); - prodl = _mm256_mullo_epi16(mulv, redv); - prod = _mm256_xor_si256( - _mm256_and_si256(prodh, mask16h), _mm256_srli_epi32(prodl, 16)); - drv = _mm256_setr_epi64x( - dr[ds[j+1]], - dr[ds[j+5]], - dr[ds[j+9]], - dr[ds[j+13]]); - resv = _mm256_add_epi64(drv, _mm256_and_si256(prod, mask32)); - _mm256_store_si256((__m256i*)(res),resv); - dr[ds[j+1]] = res[0]; - dr[ds[j+5]] = res[1]; - dr[ds[j+9]] = res[2]; - dr[ds[j+13]] = res[3]; - drv = _mm256_setr_epi64x( - dr[ds[j+3]], - dr[ds[j+7]], - dr[ds[j+11]], - dr[ds[j+15]]); - resv = _mm256_add_epi64(drv, _mm256_srli_epi64(prod, 32)); - 
_mm256_store_si256((__m256i*)(res),resv); - dr[ds[j+3]] = res[0]; - dr[ds[j+7]] = res[1]; - dr[ds[j+11]] = res[2]; - dr[ds[j+15]] = res[3]; - prod = _mm256_xor_si256( - _mm256_slli_epi32(prodh, 16), _mm256_and_si256(prodl, mask16)); - drv = _mm256_setr_epi64x( - dr[ds[j+0]], - dr[ds[j+4]], - dr[ds[j+8]], - dr[ds[j+12]]); - resv = _mm256_add_epi64(drv, _mm256_and_si256(prod, mask32)); - _mm256_store_si256((__m256i*)(res),resv); - dr[ds[j+0]] = res[0]; - dr[ds[j+4]] = res[1]; - dr[ds[j+8]] = res[2]; - dr[ds[j+12]] = res[3]; - drv = _mm256_setr_epi64x( - dr[ds[j+2]], - dr[ds[j+6]], - dr[ds[j+10]], - dr[ds[j+14]]); - resv = _mm256_add_epi64(drv, _mm256_srli_epi64(prod, 32)); - _mm256_store_si256((__m256i*)(res),resv); - dr[ds[j+2]] = res[0]; - dr[ds[j+6]] = res[1]; - dr[ds[j+10]] = res[2]; - dr[ds[j+14]] = res[3]; - } -#else - const len_t os = dts[PRELOOP]; - const len_t len = dts[LENGTH]; - const hm_t * const ds = dts + OFFSET; - for (j = 0; j < os; ++j) { - dr[ds[j]] += mul * cfs[j]; - } - for (; j < len; j += UNROLL) { - dr[ds[j]] += mul * cfs[j]; - dr[ds[j+1]] += mul * cfs[j+1]; - dr[ds[j+2]] += mul * cfs[j+2]; - dr[ds[j+3]] += mul * cfs[j+3]; +#ifdef __amd64__ + if (have_avx2) { + const uint16_t mul16 = (uint16_t)(fc - dr[i]); + mulv = _mm256_set1_epi16(mul16); + const len_t os = len % 16; + const hm_t * const ds = dts + OFFSET; + for (j = 0; j < os; ++j) { + dr[ds[j]] += mul * cfs[j]; + } + for (; j < len; j += 16) { + redv = _mm256_loadu_si256((__m256i*)(cfs+j)); + prodh = _mm256_mulhi_epu16(mulv, redv); + prodl = _mm256_mullo_epi16(mulv, redv); + prod = _mm256_xor_si256( + _mm256_and_si256(prodh, mask16h), _mm256_srli_epi32(prodl, 16)); + drv = _mm256_setr_epi64x( + dr[ds[j+1]], + dr[ds[j+5]], + dr[ds[j+9]], + dr[ds[j+13]]); + resv = _mm256_add_epi64(drv, _mm256_and_si256(prod, mask32)); + _mm256_store_si256((__m256i*)(res),resv); + dr[ds[j+1]] = res[0]; + dr[ds[j+5]] = res[1]; + dr[ds[j+9]] = res[2]; + dr[ds[j+13]] = res[3]; + drv = _mm256_setr_epi64x( + dr[ds[j+3]], + dr[ds[j+7]], + dr[ds[j+11]], + dr[ds[j+15]]); + resv = _mm256_add_epi64(drv, _mm256_srli_epi64(prod, 32)); + _mm256_store_si256((__m256i*)(res),resv); + dr[ds[j+3]] = res[0]; + dr[ds[j+7]] = res[1]; + dr[ds[j+11]] = res[2]; + dr[ds[j+15]] = res[3]; + prod = _mm256_xor_si256( + _mm256_slli_epi32(prodh, 16), _mm256_and_si256(prodl, mask16)); + drv = _mm256_setr_epi64x( + dr[ds[j+0]], + dr[ds[j+4]], + dr[ds[j+8]], + dr[ds[j+12]]); + resv = _mm256_add_epi64(drv, _mm256_and_si256(prod, mask32)); + _mm256_store_si256((__m256i*)(res),resv); + dr[ds[j+0]] = res[0]; + dr[ds[j+4]] = res[1]; + dr[ds[j+8]] = res[2]; + dr[ds[j+12]] = res[3]; + drv = _mm256_setr_epi64x( + dr[ds[j+2]], + dr[ds[j+6]], + dr[ds[j+10]], + dr[ds[j+14]]); + resv = _mm256_add_epi64(drv, _mm256_srli_epi64(prod, 32)); + _mm256_store_si256((__m256i*)(res),resv); + dr[ds[j+2]] = res[0]; + dr[ds[j+6]] = res[1]; + dr[ds[j+10]] = res[2]; + dr[ds[j+14]] = res[3]; + } } + else #endif + { + const len_t os = dts[PRELOOP]; + const hm_t * const ds = dts + OFFSET; + for (j = 0; j < os; ++j) { + dr[ds[j]] += mul * cfs[j]; + } + for (; j < len; j += UNROLL) { + dr[ds[j]] += mul * cfs[j]; + dr[ds[j+1]] += mul * cfs[j+1]; + dr[ds[j+2]] += mul * cfs[j+2]; + dr[ds[j+3]] += mul * cfs[j+3]; + } + } dr[i] = 0; } if (k == 0) { --- msolve-0.5.0/src/neogb/la_ff_32.c.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/src/neogb/la_ff_32.c 2023-07-28 11:18:29.383810853 -0600 @@ -20,7 +20,7 @@ #include "data.h" -#ifdef HAVE_AVX2 +#ifdef __amd64__ #include #endif @@ -319,7 +319,7 
@@ static hm_t *reduce_dense_row_by_known_p const len_t ncl = mat->ncl; cf32_t * const * const mcf = mat->cf_32; -#ifdef HAVE_AVX2 +#ifdef __amd64__ int64_t res[4]; __m256i redv, mulv, prodv, drv, resv; #endif @@ -348,58 +348,60 @@ static hm_t *reduce_dense_row_by_known_p } else { cfs = mcf[dts[COEFFS]]; } -#ifdef HAVE_AVX2 - const len_t len = dts[LENGTH]; - const len_t os = len % 8; - const hm_t * const ds = dts + OFFSET; - const uint32_t mul32 = (int32_t)(mod - dr[i]); - mulv = _mm256_set1_epi32(mul32); - for (j = 0; j < os; ++j) { - dr[ds[j]] += mul * cfs[j]; - } - for (; j < len; j += 8) { - redv = _mm256_lddqu_si256((__m256i*)(cfs+j)); - drv = _mm256_setr_epi64x( - dr[ds[j+1]], - dr[ds[j+3]], - dr[ds[j+5]], - dr[ds[j+7]]); - /* first four mult-adds -- lower */ - prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); - resv = _mm256_add_epi64(drv, prodv); - _mm256_storeu_si256((__m256i*)(res), resv); - dr[ds[j+1]] = res[0]; - dr[ds[j+3]] = res[1]; - dr[ds[j+5]] = res[2]; - dr[ds[j+7]] = res[3]; - /* second four mult-adds -- higher */ - prodv = _mm256_mul_epu32(mulv, redv); - drv = _mm256_setr_epi64x( - dr[ds[j]], - dr[ds[j+2]], - dr[ds[j+4]], - dr[ds[j+6]]); - resv = _mm256_add_epi64(drv, prodv); - _mm256_storeu_si256((__m256i*)(res), resv); - dr[ds[j]] = res[0]; - dr[ds[j+2]] = res[1]; - dr[ds[j+4]] = res[2]; - dr[ds[j+6]] = res[3]; - } -#else - const len_t os = dts[PRELOOP]; const len_t len = dts[LENGTH]; - const hm_t * const ds = dts + OFFSET; - for (j = 0; j < os; ++j) { - dr[ds[j]] += mul * cfs[j]; - } - for (; j < len; j += UNROLL) { - dr[ds[j]] += mul * cfs[j]; - dr[ds[j+1]] += mul * cfs[j+1]; - dr[ds[j+2]] += mul * cfs[j+2]; - dr[ds[j+3]] += mul * cfs[j+3]; - } +#ifdef __amd64__ + if (have_avx2) { + const len_t os = len % 8; + const hm_t * const ds = dts + OFFSET; + const uint32_t mul32 = (int32_t)(mod - dr[i]); + mulv = _mm256_set1_epi32(mul32); + for (j = 0; j < os; ++j) { + dr[ds[j]] += mul * cfs[j]; + } + for (; j < len; j += 8) { + redv = _mm256_lddqu_si256((__m256i*)(cfs+j)); + drv = _mm256_setr_epi64x( + dr[ds[j+1]], + dr[ds[j+3]], + dr[ds[j+5]], + dr[ds[j+7]]); + /* first four mult-adds -- lower */ + prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); + resv = _mm256_add_epi64(drv, prodv); + _mm256_storeu_si256((__m256i*)(res), resv); + dr[ds[j+1]] = res[0]; + dr[ds[j+3]] = res[1]; + dr[ds[j+5]] = res[2]; + dr[ds[j+7]] = res[3]; + /* second four mult-adds -- higher */ + prodv = _mm256_mul_epu32(mulv, redv); + drv = _mm256_setr_epi64x( + dr[ds[j]], + dr[ds[j+2]], + dr[ds[j+4]], + dr[ds[j+6]]); + resv = _mm256_add_epi64(drv, prodv); + _mm256_storeu_si256((__m256i*)(res), resv); + dr[ds[j]] = res[0]; + dr[ds[j+2]] = res[1]; + dr[ds[j+4]] = res[2]; + dr[ds[j+6]] = res[3]; + } + } else #endif + { + const len_t os = dts[PRELOOP]; + const hm_t * const ds = dts + OFFSET; + for (j = 0; j < os; ++j) { + dr[ds[j]] += mul * cfs[j]; + } + for (; j < len; j += UNROLL) { + dr[ds[j]] += mul * cfs[j]; + dr[ds[j+1]] += mul * cfs[j+1]; + dr[ds[j+2]] += mul * cfs[j+2]; + dr[ds[j+3]] += mul * cfs[j+3]; + } + } dr[i] = 0; st->application_nr_mult += len / 1000.0; st->application_nr_add += len / 1000.0; @@ -533,10 +535,10 @@ static hm_t *reduce_dense_row_by_known_p int64_t np = -1; const int64_t mod = (int64_t)st->fc; const int64_t mod2 = (int64_t)st->fc * st->fc; -#ifdef HAVE_AVX2 +#ifdef __amd64__ int64_t res[4] __attribute__((aligned(32))); __m256i cmpv, redv, drv, mulv, prodv, resv, rresv; - __m256i zerov= _mm256_set1_epi64x(0); + __m256i zerov = _mm256_set1_epi64x(0); 
__m256i mod2v = _mm256_set1_epi64x(mod2); #endif @@ -558,68 +560,70 @@ static hm_t *reduce_dense_row_by_known_p const int64_t mul = (int64_t)dr[i]; dts = pivs[i]; cfs = bs->cf_32[dts[COEFFS]]; -#ifdef HAVE_AVX2 - const len_t len = dts[LENGTH]; - const len_t os = len % 8; - const hm_t * const ds = dts + OFFSET; - const uint32_t mul32 = (uint32_t)(dr[i]); - mulv = _mm256_set1_epi32(mul32); - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += 8) { - redv = _mm256_loadu_si256((__m256i*)(cfs+j)); - drv = _mm256_setr_epi64x( - dr[ds[j+1]], - dr[ds[j+3]], - dr[ds[j+5]], - dr[ds[j+7]]); - /* first four mult-adds -- lower */ - prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j+1]] = res[0]; - dr[ds[j+3]] = res[1]; - dr[ds[j+5]] = res[2]; - dr[ds[j+7]] = res[3]; - /* second four mult-adds -- higher */ - prodv = _mm256_mul_epu32(mulv, redv); - drv = _mm256_setr_epi64x( - dr[ds[j]], - dr[ds[j+2]], - dr[ds[j+4]], - dr[ds[j+6]]); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j]] = res[0]; - dr[ds[j+2]] = res[1]; - dr[ds[j+4]] = res[2]; - dr[ds[j+6]] = res[3]; - } -#else - const len_t os = dts[PRELOOP]; const len_t len = dts[LENGTH]; - const hm_t * const ds = dts + OFFSET; - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += UNROLL) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j+1]] -= mul * cfs[j+1]; - dr[ds[j+2]] -= mul * cfs[j+2]; - dr[ds[j+3]] -= mul * cfs[j+3]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - dr[ds[j+1]] += (dr[ds[j+1]] >> 63) & mod2; - dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; - dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; - } +#ifdef __amd64__ + if (have_avx2) { + const len_t os = len % 8; + const hm_t * const ds = dts + OFFSET; + const uint32_t mul32 = (uint32_t)(dr[i]); + mulv = _mm256_set1_epi32(mul32); + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } + for (; j < len; j += 8) { + redv = _mm256_loadu_si256((__m256i*)(cfs+j)); + drv = _mm256_setr_epi64x( + dr[ds[j+1]], + dr[ds[j+3]], + dr[ds[j+5]], + dr[ds[j+7]]); + /* first four mult-adds -- lower */ + prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j+1]] = res[0]; + dr[ds[j+3]] = res[1]; + dr[ds[j+5]] = res[2]; + dr[ds[j+7]] = res[3]; + /* second four mult-adds -- higher */ + prodv = _mm256_mul_epu32(mulv, redv); + drv = _mm256_setr_epi64x( + dr[ds[j]], + dr[ds[j+2]], + dr[ds[j+4]], + dr[ds[j+6]]); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j]] = res[0]; + dr[ds[j+2]] = res[1]; + dr[ds[j+4]] = res[2]; + dr[ds[j+6]] = res[3]; + } + } else #endif + { + const len_t os = dts[PRELOOP]; + const hm_t * const ds = dts + OFFSET; + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } 
+ for (; j < len; j += UNROLL) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j+1]] -= mul * cfs[j+1]; + dr[ds[j+2]] -= mul * cfs[j+2]; + dr[ds[j+3]] -= mul * cfs[j+3]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + dr[ds[j+1]] += (dr[ds[j+1]] >> 63) & mod2; + dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; + dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; + } + } dr[i] = 0; st->application_nr_mult += len / 1000.0; st->application_nr_add += len / 1000.0; @@ -675,10 +679,10 @@ static hm_t *sba_reduce_dense_row_by_kno const int64_t mod = (int64_t)st->fc; const int64_t mod2 = (int64_t)st->fc * st->fc; const len_t nc = smat->nc; -#ifdef HAVE_AVX2 +#ifdef __amd64__ int64_t res[4] __attribute__((aligned(32))); __m256i cmpv, redv, drv, mulv, prodv, resv, rresv; - __m256i zerov= _mm256_set1_epi64x(0); + __m256i zerov = _mm256_set1_epi64x(0); __m256i mod2v = _mm256_set1_epi64x(mod2); #endif @@ -712,68 +716,70 @@ static hm_t *sba_reduce_dense_row_by_kno dts = pivs[i]; cfs = smat->cc32[dts[SM_CFS]]; -#ifdef HAVE_AVX2 - const len_t len = dts[SM_LEN]; - const len_t os = len % 8; - const hm_t * const ds = dts + SM_OFFSET; - const uint32_t mul32 = (uint32_t)(dr[i]); - mulv = _mm256_set1_epi32(mul32); - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += 8) { - redv = _mm256_loadu_si256((__m256i*)(cfs+j)); - drv = _mm256_setr_epi64x( - dr[ds[j+1]], - dr[ds[j+3]], - dr[ds[j+5]], - dr[ds[j+7]]); - /* first four mult-adds -- lower */ - prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j+1]] = res[0]; - dr[ds[j+3]] = res[1]; - dr[ds[j+5]] = res[2]; - dr[ds[j+7]] = res[3]; - /* second four mult-adds -- higher */ - prodv = _mm256_mul_epu32(mulv, redv); - drv = _mm256_setr_epi64x( - dr[ds[j]], - dr[ds[j+2]], - dr[ds[j+4]], - dr[ds[j+6]]); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j]] = res[0]; - dr[ds[j+2]] = res[1]; - dr[ds[j+4]] = res[2]; - dr[ds[j+6]] = res[3]; - } -#else - const len_t os = dts[SM_PRE]; - const len_t len = dts[SM_LEN]; - const hm_t * const ds = dts + SM_OFFSET; - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += UNROLL) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j+1]] -= mul * cfs[j+1]; - dr[ds[j+2]] -= mul * cfs[j+2]; - dr[ds[j+3]] -= mul * cfs[j+3]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - dr[ds[j+1]] += (dr[ds[j+1]] >> 63) & mod2; - dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; - dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; - } + const len_t len = dts[SM_LEN]; +#ifdef __amd64__ + if (have_avx2) { + const len_t os = len % 8; + const hm_t * const ds = dts + SM_OFFSET; + const uint32_t mul32 = (uint32_t)(dr[i]); + mulv = _mm256_set1_epi32(mul32); + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } + for (; j < len; j += 8) { + redv = _mm256_loadu_si256((__m256i*)(cfs+j)); + drv = _mm256_setr_epi64x( + dr[ds[j+1]], + dr[ds[j+3]], + dr[ds[j+5]], + dr[ds[j+7]]); + /* first four mult-adds -- lower */ + prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = 
_mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j+1]] = res[0]; + dr[ds[j+3]] = res[1]; + dr[ds[j+5]] = res[2]; + dr[ds[j+7]] = res[3]; + /* second four mult-adds -- higher */ + prodv = _mm256_mul_epu32(mulv, redv); + drv = _mm256_setr_epi64x( + dr[ds[j]], + dr[ds[j+2]], + dr[ds[j+4]], + dr[ds[j+6]]); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j]] = res[0]; + dr[ds[j+2]] = res[1]; + dr[ds[j+4]] = res[2]; + dr[ds[j+6]] = res[3]; + } + } else #endif + { + const len_t os = dts[SM_PRE]; + const hm_t * const ds = dts + SM_OFFSET; + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } + for (; j < len; j += UNROLL) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j+1]] -= mul * cfs[j+1]; + dr[ds[j+2]] -= mul * cfs[j+2]; + dr[ds[j+3]] -= mul * cfs[j+3]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + dr[ds[j+1]] += (dr[ds[j+1]] >> 63) & mod2; + dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; + dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; + } + } dr[i] = 0; st->application_nr_mult += len / 1000.0; st->application_nr_add += len / 1000.0; @@ -832,10 +838,10 @@ static hm_t *reduce_dense_row_by_known_p const len_t ncols = mat->nc; const len_t ncl = mat->ncl; cf32_t * const * const mcf = mat->cf_32; -#ifdef HAVE_AVX2 +#ifdef __amd64__ int64_t res[4] __attribute__((aligned(32))); __m256i cmpv, redv, drv, mulv, prodv, resv, rresv; - __m256i zerov= _mm256_set1_epi64x(0); + __m256i zerov = _mm256_set1_epi64x(0); __m256i mod2v = _mm256_set1_epi64x(mod2); #endif @@ -863,68 +869,70 @@ static hm_t *reduce_dense_row_by_known_p } else { cfs = mcf[dts[COEFFS]]; } -#ifdef HAVE_AVX2 - const len_t len = dts[LENGTH]; - const len_t os = len % 8; - const hm_t * const ds = dts + OFFSET; - const uint32_t mul32 = (uint32_t)(dr[i]); - mulv = _mm256_set1_epi32(mul32); - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += 8) { - redv = _mm256_loadu_si256((__m256i*)(cfs+j)); - drv = _mm256_setr_epi64x( - dr[ds[j+1]], - dr[ds[j+3]], - dr[ds[j+5]], - dr[ds[j+7]]); - /* first four mult-adds -- lower */ - prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j+1]] = res[0]; - dr[ds[j+3]] = res[1]; - dr[ds[j+5]] = res[2]; - dr[ds[j+7]] = res[3]; - /* second four mult-adds -- higher */ - prodv = _mm256_mul_epu32(mulv, redv); - drv = _mm256_setr_epi64x( - dr[ds[j]], - dr[ds[j+2]], - dr[ds[j+4]], - dr[ds[j+6]]); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j]] = res[0]; - dr[ds[j+2]] = res[1]; - dr[ds[j+4]] = res[2]; - dr[ds[j+6]] = res[3]; - } -#else - const len_t os = dts[PRELOOP]; - const len_t len = dts[LENGTH]; - const hm_t * const ds = dts + OFFSET; - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += UNROLL) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j+1]] -= mul * cfs[j+1]; - dr[ds[j+2]] -= mul * cfs[j+2]; - dr[ds[j+3]] -= mul * cfs[j+3]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - dr[ds[j+1]] 
+= (dr[ds[j+1]] >> 63) & mod2; - dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; - dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; - } + const len_t len = dts[LENGTH]; +#ifdef __amd64__ + if (have_avx2) { + const len_t os = len % 8; + const hm_t * const ds = dts + OFFSET; + const uint32_t mul32 = (uint32_t)(dr[i]); + mulv = _mm256_set1_epi32(mul32); + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } + for (; j < len; j += 8) { + redv = _mm256_loadu_si256((__m256i*)(cfs+j)); + drv = _mm256_setr_epi64x( + dr[ds[j+1]], + dr[ds[j+3]], + dr[ds[j+5]], + dr[ds[j+7]]); + /* first four mult-adds -- lower */ + prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j+1]] = res[0]; + dr[ds[j+3]] = res[1]; + dr[ds[j+5]] = res[2]; + dr[ds[j+7]] = res[3]; + /* second four mult-adds -- higher */ + prodv = _mm256_mul_epu32(mulv, redv); + drv = _mm256_setr_epi64x( + dr[ds[j]], + dr[ds[j+2]], + dr[ds[j+4]], + dr[ds[j+6]]); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j]] = res[0]; + dr[ds[j+2]] = res[1]; + dr[ds[j+4]] = res[2]; + dr[ds[j+6]] = res[3]; + } + } else #endif + { + const len_t os = dts[PRELOOP]; + const hm_t * const ds = dts + OFFSET; + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } + for (; j < len; j += UNROLL) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j+1]] -= mul * cfs[j+1]; + dr[ds[j+2]] -= mul * cfs[j+2]; + dr[ds[j+3]] -= mul * cfs[j+3]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + dr[ds[j+1]] += (dr[ds[j+1]] >> 63) & mod2; + dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; + dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; + } + } dr[i] = 0; st->application_nr_mult += len / 1000.0; st->application_nr_add += len / 1000.0; @@ -976,10 +984,10 @@ static hm_t *reduce_dense_row_by_known_p int64_t np = -1; const int64_t mod = (int64_t)st->fc; const int64_t mod2 = (int64_t)st->fc * st->fc; -#ifdef HAVE_AVX2 +#ifdef __amd64__ int64_t res[4] __attribute__((aligned(32))); __m256i cmpv, redv, drv, mulv, prodv, resv, rresv; - __m256i zerov= _mm256_set1_epi64x(0); + __m256i zerov = _mm256_set1_epi64x(0); __m256i mod2v = _mm256_set1_epi64x(mod2); #endif @@ -1005,127 +1013,129 @@ static hm_t *reduce_dense_row_by_known_p dtsm = mulh[dts[MULT]]; cfsm = mulcf[dtsm[COEFFS]]; cfs = pivcf[dts[COEFFS]]; -#ifdef HAVE_AVX2 - const len_t len = dts[LENGTH]; - const len_t os = len % 8; - const hm_t * const ds = dts + OFFSET; - const uint32_t mul32 = (uint32_t)(dr[i]); - mulv = _mm256_set1_epi32(mul32); - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += 8) { - redv = _mm256_loadu_si256((__m256i*)(cfs+j)); - drv = _mm256_setr_epi64x( - dr[ds[j+1]], - dr[ds[j+3]], - dr[ds[j+5]], - dr[ds[j+7]]); - /* first four mult-adds -- lower */ - prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j+1]] = res[0]; - dr[ds[j+3]] = res[1]; - dr[ds[j+5]] = res[2]; - dr[ds[j+7]] = res[3]; - /* second four mult-adds -- higher */ 
- prodv = _mm256_mul_epu32(mulv, redv); - drv = _mm256_setr_epi64x( - dr[ds[j]], - dr[ds[j+2]], - dr[ds[j+4]], - dr[ds[j+6]]); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j]] = res[0]; - dr[ds[j+2]] = res[1]; - dr[ds[j+4]] = res[2]; - dr[ds[j+6]] = res[3]; - } - const len_t lenm = dtsm[LENGTH]; - const len_t osm = lenm % 8; - const hm_t * const dsm = dtsm + OFFSET; - /* const uint32_t mulm32 = (uint32_t)(drm[i]); */ - mulv = _mm256_set1_epi32(mul32); - for (j = 0; j < osm; ++j) { - drm[dsm[j]] -= mul * cfsm[j]; - drm[dsm[j]] += (drm[dsm[j]] >> 63) & mod2; - } - for (; j < lenm; j += 8) { - redv = _mm256_loadu_si256((__m256i*)(cfsm+j)); - drv = _mm256_setr_epi64x( - drm[dsm[j+1]], - drm[dsm[j+3]], - drm[dsm[j+5]], - drm[dsm[j+7]]); - /* first four mult-adds -- lower */ - prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - drm[dsm[j+1]] = res[0]; - drm[dsm[j+3]] = res[1]; - drm[dsm[j+5]] = res[2]; - drm[dsm[j+7]] = res[3]; - /* second four mult-adds -- higher */ - prodv = _mm256_mul_epu32(mulv, redv); - drv = _mm256_setr_epi64x( - drm[dsm[j]], - drm[dsm[j+2]], - drm[dsm[j+4]], - drm[dsm[j+6]]); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - drm[dsm[j]] = res[0]; - drm[dsm[j+2]] = res[1]; - drm[dsm[j+4]] = res[2]; - drm[dsm[j+6]] = res[3]; - } -#else - const len_t os = dts[PRELOOP]; - const len_t len = dts[LENGTH]; - const hm_t * const ds = dts + OFFSET; - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += UNROLL) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j+1]] -= mul * cfs[j+1]; - dr[ds[j+2]] -= mul * cfs[j+2]; - dr[ds[j+3]] -= mul * cfs[j+3]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - dr[ds[j+1]] += (dr[ds[j+1]] >> 63) & mod2; - dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; - dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; - } - const len_t osm = dtsm[PRELOOP]; - const len_t lenm = dtsm[LENGTH]; - const hm_t * const dsm = dtsm + OFFSET; - for (j = 0; j < osm; ++j) { - drm[dsm[j]] -= mul * cfsm[j]; - drm[dsm[j]] += (drm[dsm[j]] >> 63) & mod2; - } - for (; j < lenm; j += UNROLL) { - drm[dsm[j]] -= mul * cfsm[j]; - drm[dsm[j+1]] -= mul * cfsm[j+1]; - drm[dsm[j+2]] -= mul * cfsm[j+2]; - drm[dsm[j+3]] -= mul * cfsm[j+3]; - drm[dsm[j]] += (drm[dsm[j]] >> 63) & mod2; - drm[dsm[j+1]] += (drm[dsm[j+1]] >> 63) & mod2; - drm[dsm[j+2]] += (drm[dsm[j+2]] >> 63) & mod2; - drm[dsm[j+3]] += (drm[dsm[j+3]] >> 63) & mod2; - } + const len_t len = dts[LENGTH]; +#ifdef __amd64__ + if (have_avx2) { + const len_t os = len % 8; + const hm_t * const ds = dts + OFFSET; + const uint32_t mul32 = (uint32_t)(dr[i]); + mulv = _mm256_set1_epi32(mul32); + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } + for (; j < len; j += 8) { + redv = _mm256_loadu_si256((__m256i*)(cfs+j)); + drv = _mm256_setr_epi64x( + dr[ds[j+1]], + dr[ds[j+3]], + dr[ds[j+5]], + dr[ds[j+7]]); + /* first four mult-adds -- lower */ + prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = 
_mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j+1]] = res[0]; + dr[ds[j+3]] = res[1]; + dr[ds[j+5]] = res[2]; + dr[ds[j+7]] = res[3]; + /* second four mult-adds -- higher */ + prodv = _mm256_mul_epu32(mulv, redv); + drv = _mm256_setr_epi64x( + dr[ds[j]], + dr[ds[j+2]], + dr[ds[j+4]], + dr[ds[j+6]]); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j]] = res[0]; + dr[ds[j+2]] = res[1]; + dr[ds[j+4]] = res[2]; + dr[ds[j+6]] = res[3]; + } + const len_t lenm = dtsm[LENGTH]; + const len_t osm = lenm % 8; + const hm_t * const dsm = dtsm + OFFSET; + /* const uint32_t mulm32 = (uint32_t)(drm[i]); */ + mulv = _mm256_set1_epi32(mul32); + for (j = 0; j < osm; ++j) { + drm[dsm[j]] -= mul * cfsm[j]; + drm[dsm[j]] += (drm[dsm[j]] >> 63) & mod2; + } + for (; j < lenm; j += 8) { + redv = _mm256_loadu_si256((__m256i*)(cfsm+j)); + drv = _mm256_setr_epi64x( + drm[dsm[j+1]], + drm[dsm[j+3]], + drm[dsm[j+5]], + drm[dsm[j+7]]); + /* first four mult-adds -- lower */ + prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + drm[dsm[j+1]] = res[0]; + drm[dsm[j+3]] = res[1]; + drm[dsm[j+5]] = res[2]; + drm[dsm[j+7]] = res[3]; + /* second four mult-adds -- higher */ + prodv = _mm256_mul_epu32(mulv, redv); + drv = _mm256_setr_epi64x( + drm[dsm[j]], + drm[dsm[j+2]], + drm[dsm[j+4]], + drm[dsm[j+6]]); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + drm[dsm[j]] = res[0]; + drm[dsm[j+2]] = res[1]; + drm[dsm[j+4]] = res[2]; + drm[dsm[j+6]] = res[3]; + } + } else #endif + { + const len_t os = dts[PRELOOP]; + const hm_t * const ds = dts + OFFSET; + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } + for (; j < len; j += UNROLL) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j+1]] -= mul * cfs[j+1]; + dr[ds[j+2]] -= mul * cfs[j+2]; + dr[ds[j+3]] -= mul * cfs[j+3]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + dr[ds[j+1]] += (dr[ds[j+1]] >> 63) & mod2; + dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; + dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; + } + const len_t osm = dtsm[PRELOOP]; + const len_t lenm = dtsm[LENGTH]; + const hm_t * const dsm = dtsm + OFFSET; + for (j = 0; j < osm; ++j) { + drm[dsm[j]] -= mul * cfsm[j]; + drm[dsm[j]] += (drm[dsm[j]] >> 63) & mod2; + } + for (; j < lenm; j += UNROLL) { + drm[dsm[j]] -= mul * cfsm[j]; + drm[dsm[j+1]] -= mul * cfsm[j+1]; + drm[dsm[j+2]] -= mul * cfsm[j+2]; + drm[dsm[j+3]] -= mul * cfsm[j+3]; + drm[dsm[j]] += (drm[dsm[j]] >> 63) & mod2; + drm[dsm[j+1]] += (drm[dsm[j+1]] >> 63) & mod2; + drm[dsm[j+2]] += (drm[dsm[j+2]] >> 63) & mod2; + drm[dsm[j+3]] += (drm[dsm[j+3]] >> 63) & mod2; + } + } dr[i] = 0; st->application_nr_mult += len / 1000.0; st->application_nr_add += len / 1000.0; @@ -1211,7 +1221,7 @@ static hm_t *trace_reduce_dense_row_by_k const len_t ncols = mat->nc; const len_t ncl = mat->ncl; cf32_t * const * const mcf = mat->cf_32; -#ifdef HAVE_AVX2 +#ifdef __amd64__ int64_t res[4] __attribute__((aligned(32))); __m256i cmpv, redv, drv, mulv, prodv, 
resv, rresv; __m256i zerov = _mm256_set1_epi64x(0); @@ -1245,68 +1255,70 @@ static hm_t *trace_reduce_dense_row_by_k cfs = mcf[dts[COEFFS]]; } -#ifdef HAVE_AVX2 - const len_t len = dts[LENGTH]; - const len_t os = len % 8; - const hm_t * const ds = dts + OFFSET; - const uint32_t mul32 = (uint32_t)(dr[i]); - mulv = _mm256_set1_epi32(mul32); - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += 8) { - redv = _mm256_loadu_si256((__m256i*)(cfs+j)); - drv = _mm256_setr_epi64x( - dr[ds[j+1]], - dr[ds[j+3]], - dr[ds[j+5]], - dr[ds[j+7]]); - /* first four mult-adds -- lower */ - prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j+1]] = res[0]; - dr[ds[j+3]] = res[1]; - dr[ds[j+5]] = res[2]; - dr[ds[j+7]] = res[3]; - /* second four mult-adds -- higher */ - prodv = _mm256_mul_epu32(mulv, redv); - drv = _mm256_setr_epi64x( - dr[ds[j]], - dr[ds[j+2]], - dr[ds[j+4]], - dr[ds[j+6]]); - resv = _mm256_sub_epi64(drv, prodv); - cmpv = _mm256_cmpgt_epi64(zerov, resv); - rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); - _mm256_store_si256((__m256i*)(res), rresv); - dr[ds[j]] = res[0]; - dr[ds[j+2]] = res[1]; - dr[ds[j+4]] = res[2]; - dr[ds[j+6]] = res[3]; - } -#else - const len_t os = dts[PRELOOP]; - const len_t len = dts[LENGTH]; - const hm_t * const ds = dts + OFFSET; - for (j = 0; j < os; ++j) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - } - for (; j < len; j += UNROLL) { - dr[ds[j]] -= mul * cfs[j]; - dr[ds[j+1]] -= mul * cfs[j+1]; - dr[ds[j+2]] -= mul * cfs[j+2]; - dr[ds[j+3]] -= mul * cfs[j+3]; - dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; - dr[ds[j+1]] += (dr[ds[j+1]] >> 63) & mod2; - dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; - dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; - } + const len_t len = dts[LENGTH]; +#ifdef __amd64__ + if (have_avx2) { + const len_t os = len % 8; + const hm_t * const ds = dts + OFFSET; + const uint32_t mul32 = (uint32_t)(dr[i]); + mulv = _mm256_set1_epi32(mul32); + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } + for (; j < len; j += 8) { + redv = _mm256_loadu_si256((__m256i*)(cfs+j)); + drv = _mm256_setr_epi64x( + dr[ds[j+1]], + dr[ds[j+3]], + dr[ds[j+5]], + dr[ds[j+7]]); + /* first four mult-adds -- lower */ + prodv = _mm256_mul_epu32(mulv, _mm256_srli_epi64(redv, 32)); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j+1]] = res[0]; + dr[ds[j+3]] = res[1]; + dr[ds[j+5]] = res[2]; + dr[ds[j+7]] = res[3]; + /* second four mult-adds -- higher */ + prodv = _mm256_mul_epu32(mulv, redv); + drv = _mm256_setr_epi64x( + dr[ds[j]], + dr[ds[j+2]], + dr[ds[j+4]], + dr[ds[j+6]]); + resv = _mm256_sub_epi64(drv, prodv); + cmpv = _mm256_cmpgt_epi64(zerov, resv); + rresv = _mm256_add_epi64(resv, _mm256_and_si256(cmpv, mod2v)); + _mm256_store_si256((__m256i*)(res), rresv); + dr[ds[j]] = res[0]; + dr[ds[j+2]] = res[1]; + dr[ds[j+4]] = res[2]; + dr[ds[j+6]] = res[3]; + } + } else #endif + { + const len_t os = dts[PRELOOP]; + const hm_t * const ds = dts + OFFSET; + for (j = 0; j < os; ++j) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + } + for (; j < 
len; j += UNROLL) { + dr[ds[j]] -= mul * cfs[j]; + dr[ds[j+1]] -= mul * cfs[j+1]; + dr[ds[j+2]] -= mul * cfs[j+2]; + dr[ds[j+3]] -= mul * cfs[j+3]; + dr[ds[j]] += (dr[ds[j]] >> 63) & mod2; + dr[ds[j+1]] += (dr[ds[j+1]] >> 63) & mod2; + dr[ds[j+2]] += (dr[ds[j+2]] >> 63) & mod2; + dr[ds[j+3]] += (dr[ds[j+3]] >> 63) & mod2; + } + } dr[i] = 0; st->trace_nr_mult += len / 1000.0; st->trace_nr_add += len / 1000.0; --- msolve-0.5.0/src/neogb/symbol.c.orig 2023-07-04 01:31:27.000000000 -0600 +++ msolve-0.5.0/src/neogb/symbol.c 2023-07-28 11:16:27.465530091 -0600 @@ -21,10 +21,6 @@ #include "data.h" -#ifdef HAVE_AVX2 -#include <immintrin.h> -#endif - /* select_all_pairs() is unused at the moment */ #if 0 static void select_all_spairs(
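
For reference, every per-file change above hinges on the have_avx2 flag that src/neogb/data.c now sets in a constructor before main() runs. Below is a minimal standalone sketch of that detection idiom, not part of the patch: it assumes GCC or clang on x86-64 with <cpuid.h> available, and it uses __get_cpuid_count(7, 0, ...) so the CPUID leaf-7 subleaf in ECX is explicitly zero (the patch itself calls __get_cpuid(7, ...)); bit_AVX2 is the EBX mask provided by <cpuid.h>.

/* Standalone sketch of the runtime AVX2 check (assumption: GCC/clang,
 * <cpuid.h> providing __get_cpuid_count and bit_AVX2; not part of the patch). */
#include <stdio.h>

#ifdef __amd64__
#include <cpuid.h>

static _Bool have_avx2;

/* Runs before main(), like the constructor added in data.c. */
static void __attribute__((constructor)) set_avx2_flag(void)
{
    unsigned int eax, ebx, ecx, edx;
    have_avx2 = __get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)
                && (ebx & bit_AVX2) != 0;
}
#endif

int main(void)
{
#ifdef __amd64__
    printf("AVX2 available at runtime: %s\n", have_avx2 ? "yes" : "no");
#else
    printf("non-x86-64 build: the AVX2 paths are compiled out\n");
#endif
    return 0;
}

Compiled on its own (for example with cc check_avx2.c, file name hypothetical), this prints whether the branches guarded by if (have_avx2) in the files above would be taken on the current machine; the patched library makes the same decision once at load time instead of at configure time.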