diff --git a/cmake_targets/CMakeLists.txt b/cmake_targets/CMakeLists.txt
index 8fc2f4fd096bada25154ef2fe4c5928358431e8f..d162337ee864ae6d5b80bb30fc00ee37fd0198a3 100644
--- a/cmake_targets/CMakeLists.txt
+++ b/cmake_targets/CMakeLists.txt
@@ -134,7 +134,7 @@ else (CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7l")
     set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2")
   endif()
   if (CPUINFO MATCHES "sse4_2")
-    set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2 -msse4.2")
+    set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -mavx2 -msse4.2 -fno-tree-vectorize")
   endif()
   if (CPUINFO MATCHES "sse4_1")
     set(C_FLAGS_PROCESSOR "${C_FLAGS_PROCESSOR} -msse4.1")
diff --git a/openair1/PHY/CODING/3gpplte_sse.c b/openair1/PHY/CODING/3gpplte_sse.c
index e486c8e0b4c733b3f39a2acbbc8a94a073f623e4..ac4a1ae3536cb61491f068499902102c72e2d13c 100755
--- a/openair1/PHY/CODING/3gpplte_sse.c
+++ b/openair1/PHY/CODING/3gpplte_sse.c
@@ -223,6 +223,7 @@ char interleave_compact_byte(short * base_interleaver,unsigned char * input, uns
                                     0b00000001};
 #endif
  
+
 #ifndef __AVX2__
   if ((n&15) > 0)
     loop++;
diff --git a/openair1/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c b/openair1/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c
index 11720087ee81e6d01c4c90c4509925201b5e6c7d..75e8eaf126cdae33f7f6700d67dd2aca9c63914f 100644
--- a/openair1/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c
+++ b/openair1/PHY/CODING/3gpplte_turbo_decoder_avx2_16bit.c
@@ -186,12 +186,16 @@ void compute_alpha16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,uint16
   __m256i new0,new1,new2,new3,new4,new5,new6,new7;
   __m256i alpha_max;
 
+  unsigned long long timein,timeout;
+
   l2 = L>>3;
   K1 = (frame_length>>3);
 #ifdef DEBUG_LOGMAP
   fprintf(fdavx2,"Compute alpha (avx2_16bit)\n");
   fprintf(fdavx2b,"Compute alpha (avx2_16bit)\n");
 #endif
+  timein = rdtsc_oai();
+
   for (l=K1;; l=l2,rerun_flag=1) {
     alpha128 = (__m256i *)alpha;
 
@@ -378,6 +382,9 @@ void compute_alpha16avx2(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,uint16
     if (rerun_flag==1)
       break;
   }
+  timeout = rdtsc_oai();
+  printf("alpha: inner loop time %llu\n",timeout-timein);
+
 }
 
 
@@ -386,9 +393,10 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
 
   int k,rerun_flag=0;
 
-  __m256i m11_128,m10_128;
-  __m256i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
-  __m256i new0,new1,new2,new3,new4,new5,new6,new7;
+  __m256i *m11p,*m10p;
+  register __m256i b0,b1,b2,b3,b4,b5,b6,b7;
+  register __m256i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
+  register __m256i new0,new1,new2,new3,new4,new5,new6,new7;
 
   __m256i *beta128,*alpha128,*beta_ptr;
   __m256i beta_max;
@@ -398,6 +406,8 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
   llr_t beta0,beta1;
   llr_t beta0_cw2,beta1_cw2;
 
+  unsigned long long timein,timeout;
+
 #ifdef DEBUG_LOGMAP
   fprintf(fdavx2,"compute_beta (avx2_16bit), %p,%p,%p,%p,framelength %d,F %d\n",
       beta,m_11,m_10,alpha,frame_length,F);
@@ -590,56 +600,74 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
 #endif
     int loopval=((rerun_flag==0)?0:((frame_length-L)>>3));
 
+    printf("beta: rerun %d => loopval %d\n",rerun_flag,loopval);
+
+    timein = rdtsc_oai();
+
+    m11p = (frame_length>>3)-1+(__m256i*)m_11;
+    m10p = (frame_length>>3)-1+(__m256i*)m_10;
+
     for (k=(frame_length>>3)-1; k>=loopval; k--) {
 
-      m11_128=((__m256i*)m_11)[k];
-      m10_128=((__m256i*)m_10)[k];
-
-      m_b0 = _mm256_adds_epi16(beta_ptr[4],m11_128);  //m11
-      m_b1 = _mm256_subs_epi16(beta_ptr[4],m11_128);  //m00
-      m_b2 = _mm256_subs_epi16(beta_ptr[5],m10_128);  //m01
-      m_b3 = _mm256_adds_epi16(beta_ptr[5],m10_128);  //m10
-      m_b4 = _mm256_adds_epi16(beta_ptr[6],m10_128);  //m10
-      m_b5 = _mm256_subs_epi16(beta_ptr[6],m10_128);  //m01
-      m_b6 = _mm256_subs_epi16(beta_ptr[7],m11_128);  //m00
-      m_b7 = _mm256_adds_epi16(beta_ptr[7],m11_128);  //m11
-
-      new0 = _mm256_subs_epi16(beta_ptr[0],m11_128);  //m00
-      new1 = _mm256_adds_epi16(beta_ptr[0],m11_128);  //m11
-      new2 = _mm256_adds_epi16(beta_ptr[1],m10_128);  //m10
-      new3 = _mm256_subs_epi16(beta_ptr[1],m10_128);  //m01
-      new4 = _mm256_subs_epi16(beta_ptr[2],m10_128);  //m01
-      new5 = _mm256_adds_epi16(beta_ptr[2],m10_128);  //m10
-      new6 = _mm256_adds_epi16(beta_ptr[3],m11_128);  //m11
-      new7 = _mm256_subs_epi16(beta_ptr[3],m11_128);  //m00
+      
+      b4 = _mm256_load_si256(&beta_ptr[4]);
+      b5 = _mm256_load_si256(&beta_ptr[5]);
+      b6 = _mm256_load_si256(&beta_ptr[6]);
+      b7 = _mm256_load_si256(&beta_ptr[7]);
+
+      m_b0 = _mm256_adds_epi16(b4,*m11p);  //m11
+      m_b1 = _mm256_subs_epi16(b4,*m11p);  //m00
+      m_b2 = _mm256_subs_epi16(b5,*m10p);  //m01
+      m_b3 = _mm256_adds_epi16(b5,*m10p);  //m10
+      m_b4 = _mm256_adds_epi16(b6,*m10p);  //m10
+      m_b5 = _mm256_subs_epi16(b6,*m10p);  //m01
+      m_b6 = _mm256_subs_epi16(b7,*m11p);  //m00
+      m_b7 = _mm256_adds_epi16(b7,*m11p);  //m11
+
+      b0 = _mm256_load_si256(&beta_ptr[0]);
+      b1 = _mm256_load_si256(&beta_ptr[1]);
+      b2 = _mm256_load_si256(&beta_ptr[2]);
+      b3 = _mm256_load_si256(&beta_ptr[3]);
+
+      new0 = _mm256_subs_epi16(b0,*m11p);  //m00
+      new1 = _mm256_adds_epi16(b0,*m11p);  //m11
+      new2 = _mm256_adds_epi16(b1,*m10p);  //m10
+      new3 = _mm256_subs_epi16(b1,*m10p);  //m01
+      new4 = _mm256_subs_epi16(b2,*m10p);  //m01
+      new5 = _mm256_adds_epi16(b2,*m10p);  //m10
+      new6 = _mm256_adds_epi16(b3,*m11p);  //m11
+      new7 = _mm256_subs_epi16(b3,*m11p);  //m00
+
+
+      b0 = _mm256_max_epi16(m_b0,new0);
+      b1 = _mm256_max_epi16(m_b1,new1);
+      b2 = _mm256_max_epi16(m_b2,new2);
+      b3 = _mm256_max_epi16(m_b3,new3);
+      b4 = _mm256_max_epi16(m_b4,new4);
+      b5 = _mm256_max_epi16(m_b5,new5);
+      b6 = _mm256_max_epi16(m_b6,new6);
+      b7 = _mm256_max_epi16(m_b7,new7);
+
+      beta_max = _mm256_max_epi16(b0,b1);
+      beta_max = _mm256_max_epi16(beta_max   ,b2);
+      beta_max = _mm256_max_epi16(beta_max   ,b3);
+      beta_max = _mm256_max_epi16(beta_max   ,b4);
+      beta_max = _mm256_max_epi16(beta_max   ,b5);
+      beta_max = _mm256_max_epi16(beta_max   ,b6);
+      beta_max = _mm256_max_epi16(beta_max   ,b7);
 
       beta_ptr-=8;
-
-      beta_ptr[0] = _mm256_max_epi16(m_b0,new0);
-      beta_ptr[1] = _mm256_max_epi16(m_b1,new1);
-      beta_ptr[2] = _mm256_max_epi16(m_b2,new2);
-      beta_ptr[3] = _mm256_max_epi16(m_b3,new3);
-      beta_ptr[4] = _mm256_max_epi16(m_b4,new4);
-      beta_ptr[5] = _mm256_max_epi16(m_b5,new5);
-      beta_ptr[6] = _mm256_max_epi16(m_b6,new6);
-      beta_ptr[7] = _mm256_max_epi16(m_b7,new7);
-
-      beta_max = _mm256_max_epi16(beta_ptr[0],beta_ptr[1]);
-      beta_max = _mm256_max_epi16(beta_max   ,beta_ptr[2]);
-      beta_max = _mm256_max_epi16(beta_max   ,beta_ptr[3]);
-      beta_max = _mm256_max_epi16(beta_max   ,beta_ptr[4]);
-      beta_max = _mm256_max_epi16(beta_max   ,beta_ptr[5]);
-      beta_max = _mm256_max_epi16(beta_max   ,beta_ptr[6]);
-      beta_max = _mm256_max_epi16(beta_max   ,beta_ptr[7]);
-
-      beta_ptr[0] = _mm256_subs_epi16(beta_ptr[0],beta_max);
-      beta_ptr[1] = _mm256_subs_epi16(beta_ptr[1],beta_max);
-      beta_ptr[2] = _mm256_subs_epi16(beta_ptr[2],beta_max);
-      beta_ptr[3] = _mm256_subs_epi16(beta_ptr[3],beta_max);
-      beta_ptr[4] = _mm256_subs_epi16(beta_ptr[4],beta_max);
-      beta_ptr[5] = _mm256_subs_epi16(beta_ptr[5],beta_max);
-      beta_ptr[6] = _mm256_subs_epi16(beta_ptr[6],beta_max);
-      beta_ptr[7] = _mm256_subs_epi16(beta_ptr[7],beta_max);
+      m11p--;
+      m10p--;
+
+      beta_ptr[0] = _mm256_subs_epi16(b0,beta_max);
+      beta_ptr[1] = _mm256_subs_epi16(b1,beta_max);
+      beta_ptr[2] = _mm256_subs_epi16(b2,beta_max);
+      beta_ptr[3] = _mm256_subs_epi16(b3,beta_max);
+      beta_ptr[4] = _mm256_subs_epi16(b4,beta_max);
+      beta_ptr[5] = _mm256_subs_epi16(b5,beta_max);
+      beta_ptr[6] = _mm256_subs_epi16(b6,beta_max);
+      beta_ptr[7] = _mm256_subs_epi16(b7,beta_max);
 
 #ifdef DEBUG_LOGMAP
       fprintf(fdavx2,"Loop index %d, mb\n",k);
@@ -658,6 +686,8 @@ void compute_beta16avx2(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,uint16_
 
 #endif
     }
+    timeout = rdtsc_oai();
+    printf("beta: inner loop time %llu\n",timeout-timein);
 
     if (rerun_flag==1)
       break;
@@ -968,7 +998,7 @@ unsigned char phy_threegpplte_turbo_decoder16avx2(int16_t *y,
   yp2 = yparity2;
 
 
-
+#if 0
   for (i=0; i<n; i+=8) {
     pi2_p = &pi2tab16avx2[iind][i];
 
@@ -1084,9 +1114,23 @@ unsigned char phy_threegpplte_turbo_decoder16avx2(int16_t *y,
     yp128_cw2+=3;
 
   }
-
   yp=(llr_t*)yp128;
   yp_cw2=(llr_t*)yp128_cw2;
+#else
+  
+  pi2_p    = &pi2tab16avx2[iind][0];
+  for (i=0,j=0; i<n; i++) {
+    s[*pi2_p]     = y[j];
+    s[*pi2_p+8]   = y2[j++];
+    yp1[*pi2_p]   = y[j];
+    yp1[*pi2_p+8] = y2[j++];
+    yp2[*pi2_p]   = y[j];
+    yp2[(*pi2_p++)+8] = y2[j++];
+  }    
+  yp=(llr_t*)&y[j];
+  yp_cw2=(llr_t*)&y2[j];
+#endif
+
 
   // Termination
   for (i=0; i<3; i++) {
diff --git a/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c b/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c
index b8abeda3ccb9ef08631a1edb28401d2312e49537..b70b1aee9550bef5559418bbfca2c734985124a0 100644
--- a/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c
+++ b/openair1/PHY/CODING/3gpplte_turbo_decoder_sse_16bit.c
@@ -144,12 +144,25 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity
   fprintf(fdsse4,"compute_gamma (sse_16bit), %p,%p,%p,%p,framelength %d\n",m11,m10,systematic,y_parity,frame_length);
 #endif
 
+#ifndef __AVX2__
   K1=frame_length>>3;
+#else
+  if ((frame_length&15) > 0)
+    K1=(frame_length+1)>>4;
+  else
+    K1=frame_length>>4;
+#endif
 
   for (k=0; k<K1; k++) {
 #if defined(__x86_64__) || defined(__i386__)
+#ifndef __AVX2__
     m11_128[k] = _mm_srai_epi16(_mm_adds_epi16(systematic128[k],y_parity128[k]),1);
     m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(systematic128[k],y_parity128[k]),1);
+#else
+    ((__m256i*)m11_128)[k] = _mm256_srai_epi16(_mm256_adds_epi16(((__m256i*)systematic128)[k],((__m256i*)y_parity128)[k]),1);
+    //    ((__m256i*)m10_128)[k] = _mm256_srai_epi16(_mm256_subs_epi16(((__m256i*)y_parity128)[k],((__m256i*)systematic128)[k]),1);
+    ((__m256i*)m10_128)[k] = _mm256_srai_epi16(_mm256_subs_epi16(((__m256i*)systematic128)[k],((__m256i*)y_parity128)[k]),1);
+#endif
 #elif defined(__arm__)
     m11_128[k] = vhaddq_s16(systematic128[k],y_parity128[k]);
     m10_128[k] = vhsubq_s16(systematic128[k],y_parity128[k]);
@@ -164,13 +177,19 @@ void compute_gamma16(llr_t* m11,llr_t* m10,llr_t* systematic,channel_t* y_parity
 #endif
   }
 
+  k=frame_length>>3;
   // Termination
 #if defined(__x86_64__) || defined(__i386__)
   m11_128[k] = _mm_srai_epi16(_mm_adds_epi16(systematic128[k+term_flag],y_parity128[k]),1);
+  //#ifndef __AVX2__
+#if 1
   m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(systematic128[k+term_flag],y_parity128[k]),1);
+#else
+  m10_128[k] = _mm_srai_epi16(_mm_subs_epi16(y_parity128[k],systematic128[k+term_flag]),1);
+#endif
 #elif defined(__arm__)
   m11_128[k] = vhaddq_s16(systematic128[k+term_flag],y_parity128[k]);
-m10_128[k] = vhsubq_s16(systematic128[k+term_flag],y_parity128[k]);
+  m10_128[k] = vhsubq_s16(systematic128[k+term_flag],y_parity128[k]);
 #endif
 
 #ifdef DEBUG_LOGMAP
@@ -188,11 +207,21 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
 {
   int k,l,l2,K1,rerun_flag=0;
 #if defined(__x86_64__) || defined(__i386__)
-  __m128i *alpha128=(__m128i *)alpha,*alpha_ptr;
-  __m128i a0,a1,a2,a3,a4,a5,a6,a7,*m11p,*m10p;
+  __m128i *alpha128=(__m128i *)alpha,*alpha_ptr,*m11p,*m10p;
+  //#ifndef __AVX2__
+#if 1
+  __m128i a0,a1,a2,a3,a4,a5,a6,a7;
   __m128i m_b0,m_b1,m_b2,m_b3,m_b4,m_b5,m_b6,m_b7;
   __m128i new0,new1,new2,new3,new4,new5,new6,new7;
   __m128i alpha_max;
+#else
+  __m256i *alpha256=(__m256i *)alpha,*alpha_ptr256,m11,m10;
+  __m256i a01,a23,a45,a67,a02,a13,a64,a75;
+  __m256i m_b01,m_b23,m_b45,m_b67,new01,new23,new45,new67;
+  __m256i m11m10_256;
+  __m256i alpha_max;
+#endif
+
 #elif defined(__arm__)
   int16x8_t *alpha128=(int16x8_t *)alpha,*alpha_ptr;
   int16x8_t a0,a1,a2,a3,a4,a5,a6,a7,*m11p,*m10p;
@@ -208,6 +237,10 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
   for (l=K1;; l=l2,rerun_flag=1) {
 #if defined(__x86_64__) || defined(__i386__)
     alpha128 = (__m128i *)alpha;
+    //#ifdef __AVX2__
+#if 0
+    alpha256 = (__m256i *)alpha;
+#endif
 #elif defined(__arm__)
     alpha128 = (int16x8_t *)alpha;
 #endif
@@ -288,6 +321,11 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
     }
 
     alpha_ptr = &alpha128[0];
+    //#ifdef __AVX2__
+#if 0
+    alpha_ptr256 = &alpha256[0];
+#endif
+
 #if defined(__x86_64__) || defined(__i386__)
     m11p = (__m128i*)m_11;
     m10p = (__m128i*)m_10;
@@ -300,6 +338,8 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
          k++) {
 
 #if defined(__x86_64__) || defined(__i386__)
+      //#ifndef __AVX2__
+#if 1
       a1=_mm_load_si128(&alpha_ptr[1]);
       a3=_mm_load_si128(&alpha_ptr[3]);
       a5=_mm_load_si128(&alpha_ptr[5]);
@@ -344,6 +384,37 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
       alpha_max = _mm_max_epi16(alpha_max,a5);
       alpha_max = _mm_max_epi16(alpha_max,a6);
       alpha_max = _mm_max_epi16(alpha_max,a7);
+#else
+      a02=_mm256_load_si256(&alpha_ptr256[0]);
+      a13=_mm256_load_si256(&alpha_ptr256[1]);
+      a64=_mm256_load_si256(&alpha_ptr256[2]);
+      a75=_mm256_load_si256(&alpha_ptr256[3]);
+      m11m10_256 = _mm256_insertf128_si256(m11m10_256,*m11p,0);
+      m11m10_256 = _mm256_insertf128_si256(m11m10_256,*m10p,1);
+
+
+      m_b01 = _mm256_adds_epi16(a13,m11m10_256); //negative m10
+      m_b23 = _mm256_subs_epi16(a75,m11m10_256); //negative m10
+      m_b45 = _mm256_subs_epi16(a13,m11m10_256); //negative m10
+      m_b67 = _mm256_adds_epi16(a75,m11m10_256); //negative m10
+
+      new01 = _mm256_subs_epi16(a02,m11m10_256);  //negative m10
+      new23 = _mm256_adds_epi16(a64,m11m10_256);  //negative m10
+      new45 = _mm256_adds_epi16(a02,m11m10_256);  //negative m10
+      new67 = _mm256_subs_epi16(a64,m11m10_256);  //negative m10
+
+      a01   = _mm256_max_epi16(m_b01,new01);
+      a23   = _mm256_max_epi16(m_b23,new23);
+      a45   = _mm256_max_epi16(m_b45,new45);
+      a67   = _mm256_max_epi16(m_b67,new67);
+
+      alpha_max = _mm256_max_epi16(a01,a23);
+      alpha_max = _mm256_max_epi16(alpha_max,a45);
+      alpha_max = _mm256_max_epi16(alpha_max,a67);
+      alpha_max = _mm256_max_epi16(alpha_max,_mm256_permutevar8x32_epi32(alpha_max,_mm256_set_epi32(3,2,1,0,7,6,5,4)));
+      
+      
+#endif
 #elif defined(__arm__)
       m_b0 = vqaddq_s16(alpha_ptr[1],*m11p);  // m11
       m_b4 = vqsubq_s16(alpha_ptr[1],*m11p);  // m00=-m11
@@ -383,9 +454,15 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
 #endif
 
       alpha_ptr+=8;
+      //#ifdef __AVX2__
+#if 0
+      alpha_ptr256+=4;
+#endif
       m11p++;
       m10p++;
 #if defined(__x86_64__) || defined(__i386__)
+      //#ifndef __AVX2__
+#if 1
       alpha_ptr[0] = _mm_subs_epi16(a0,alpha_max);
       alpha_ptr[1] = _mm_subs_epi16(a1,alpha_max);
       alpha_ptr[2] = _mm_subs_epi16(a2,alpha_max);
@@ -394,6 +471,18 @@ void compute_alpha16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,unsigned s
       alpha_ptr[5] = _mm_subs_epi16(a5,alpha_max);
       alpha_ptr[6] = _mm_subs_epi16(a6,alpha_max);
       alpha_ptr[7] = _mm_subs_epi16(a7,alpha_max);
+#else
+
+      a01   = _mm256_subs_epi16(a01,alpha_max);
+      a23   = _mm256_subs_epi16(a23,alpha_max);
+      a45   = _mm256_subs_epi16(a45,alpha_max);
+      a67   = _mm256_subs_epi16(a67,alpha_max);
+
+      alpha_ptr256[0] = _mm256_permute2x128_si256(a01,a23,0x20);  //a02
+      alpha_ptr256[1] = _mm256_permute2x128_si256(a01,a23,0x13);  //a13
+      alpha_ptr256[2] = _mm256_permute2x128_si256(a45,a67,0x02);  //a64
+      alpha_ptr256[3] = _mm256_permute2x128_si256(a45,a67,0x31);  //a75
+#endif
 #elif defined(__arm__)
       alpha_ptr[0] = vqsubq_s16(a0,alpha_max);
       alpha_ptr[1] = vqsubq_s16(a1,alpha_max);
@@ -488,8 +577,12 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
 
   //  fprintf(fdsse4,"beta init: offset8 %d\n",offset8_flag);
   m11=(int16_t)m_11[2+frame_length];
+  //#ifndef __AVX2__
+#if 1
   m10=(int16_t)m_10[2+frame_length];
-
+#else
+  m10=-(int16_t)m_10[2+frame_length];
+#endif
 #ifdef DEBUG_LOGMAP
   fprintf(fdsse4,"m11,m10 %d,%d\n",m11,m10);
 #endif
@@ -643,6 +736,9 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
       m11_128=((__m128i*)m_11)[k];
       m10_128=((__m128i*)m_10)[k];
 
+
+      //#ifndef __AVX2__
+#if 1
       m_b0 = _mm_adds_epi16(beta_ptr[4],m11_128);  //m11
       m_b1 = _mm_subs_epi16(beta_ptr[4],m11_128);  //m00
       m_b2 = _mm_subs_epi16(beta_ptr[5],m10_128);  //m01
@@ -652,6 +748,7 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
       m_b6 = _mm_subs_epi16(beta_ptr[7],m11_128);  //m00
       m_b7 = _mm_adds_epi16(beta_ptr[7],m11_128);  //m11
 
+
       new0 = _mm_subs_epi16(beta_ptr[0],m11_128);  //m00
       new1 = _mm_adds_epi16(beta_ptr[0],m11_128);  //m11
       new2 = _mm_adds_epi16(beta_ptr[1],m10_128);  //m10
@@ -661,8 +758,29 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
       new6 = _mm_adds_epi16(beta_ptr[3],m11_128);  //m11
       new7 = _mm_subs_epi16(beta_ptr[3],m11_128);  //m00
 
+#else
+      b01=_mm256_load_si256(&((__m256i*)beta_ptr)[0]);
+      b23=_mm256_load_si256(&((__m256i*)beta_ptr)[1]);
+      b45=_mm256_load_si256(&((__m256i*)beta_ptr)[2]);
+      b67=_mm256_load_si256(&((__m256i*)beta_ptr)[3]);
+      m11m10_256 = _mm256_insertf128_si256(m11m10_256,m11_128,0);
+      m11m10_256 = _mm256_insertf128_si256(m11m10_256,m10_128,1);
+
+
+      m_b02 = _mm256_adds_epi16(b45,m11m10_256); //negative m10
+      m_b13 = _mm256_subs_epi16(b45,m11m10_256); //negative m10
+      m_b64 = _mm256_subs_epi16(b67,m11m10_256); //negative m10
+      m_b75 = _mm256_adds_epi16(b67,m11m10_256); //negative m10
+      new02 = _mm256_subs_epi16(b01,m11m10_256);  //negative m10
+      new13 = _mm256_adds_epi16(b01,m11m10_256);  //negative m10
+      new64 = _mm256_adds_epi16(b23,m11m10_256);  //negative m10
+      new75 = _mm256_subs_epi16(b23,m11m10_256);  //negative m10
+#endif
+
       beta_ptr-=8;
 
+      //#ifndef __AVX2__
+#if 1
       beta_ptr[0] = _mm_max_epi16(m_b0,new0);
       beta_ptr[1] = _mm_max_epi16(m_b1,new1);
       beta_ptr[2] = _mm_max_epi16(m_b2,new2);
@@ -688,6 +806,28 @@ void compute_beta16(llr_t* alpha,llr_t* beta,llr_t *m_11,llr_t* m_10,unsigned sh
       beta_ptr[5] = _mm_subs_epi16(beta_ptr[5],beta_max);
       beta_ptr[6] = _mm_subs_epi16(beta_ptr[6],beta_max);
       beta_ptr[7] = _mm_subs_epi16(beta_ptr[7],beta_max);
+#else
+      b02   = _mm256_max_epi16(m_b02,new02);
+      b13   = _mm256_max_epi16(m_b13,new13);
+      b64   = _mm256_max_epi16(m_b64,new64);
+      b75   = _mm256_max_epi16(m_b75,new75);
+      
+      beta_max = _mm256_max_epi16(b02,b13);
+      beta_max = _mm256_max_epi16(beta_max,b64);
+      beta_max = _mm256_max_epi16(beta_max,b75);
+      beta_max = _mm256_max_epi16(beta_max,_mm256_permutevar8x32_epi32(beta_max,_mm256_set_epi32(3,2,1,0,7,6,5,4)));
+
+      b02   = _mm256_subs_epi16(b02,beta_max);
+      b13   = _mm256_subs_epi16(b13,beta_max);
+      b64   = _mm256_subs_epi16(b64,beta_max);
+      b75   = _mm256_subs_epi16(b75,beta_max);
+
+      ((__m256i*)beta_ptr)[0] = _mm256_permute2x128_si256(b02,b13,0x02);  //b01
+      ((__m256i*)beta_ptr)[1] = _mm256_permute2x128_si256(b02,b13,0x31);  //b23
+      ((__m256i*)beta_ptr)[2] = _mm256_permute2x128_si256(b64,b75,0x31);  //b45
+      ((__m256i*)beta_ptr)[3] = _mm256_permute2x128_si256(b64,b75,0x20);  //b67
+#endif
+
 #elif defined(__arm__)
       m11_128=((int16x8_t*)m_11)[k];
       m10_128=((int16x8_t*)m_10)[k];
@@ -820,6 +960,9 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
       print_shorts("b6:",&beta_ptr[6]);
       print_shorts("b7:",&beta_ptr[7]);
     */
+
+    //#ifndef __AVX2__
+#if 1
     m00_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00;
     m11_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11;
     m00_3 = _mm_adds_epi16(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00;
@@ -836,6 +979,32 @@ void compute_ext16(llr_t* alpha,llr_t* beta,llr_t* m_11,llr_t* m_10,llr_t* ext,
     m10_2 = _mm_adds_epi16(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10;
     m10_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10;
     m01_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01;
+#else
+
+
+    m00_1 = _mm_adds_epi16(alpha_ptr[0],beta_ptr[0]); //ALPHA_BETA_1m00;
+    m10_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[1]); //ALPHA_BETA_1m10;
+    m11_1 = _mm_adds_epi16(alpha_ptr[0],beta_ptr[4]); //ALPHA_BETA_1m11;
+    m01_1 = _mm_adds_epi16(alpha_ptr[2],beta_ptr[5]); //ALPHA_BETA_1m01;
+
+    m11_2 = _mm_adds_epi16(alpha_ptr[1],beta_ptr[0]); //ALPHA_BETA_2m11;
+    m01_2 = _mm_adds_epi16(alpha_ptr[3],beta_ptr[1]); //ALPHA_BETA_2m01;
+    m00_2 = _mm_adds_epi16(alpha_ptr[1],beta_ptr[4]); //ALPHA_BETA_2m00;
+    m10_2 = _mm_adds_epi16(alpha_ptr[3],beta_ptr[5]); //ALPHA_BETA_2m10;
+
+    m11_3 = _mm_adds_epi16(alpha_ptr[6],beta_ptr[3]); //ALPHA_BETA_3m11;
+    m01_3 = _mm_adds_epi16(alpha_ptr[4],beta_ptr[2]); //ALPHA_BETA_3m01;
+    m00_3 = _mm_adds_epi16(alpha_ptr[6],beta_ptr[7]); //ALPHA_BETA_3m00;
+    m10_3 = _mm_adds_epi16(alpha_ptr[4],beta_ptr[6]); //ALPHA_BETA_3m10;
+
+    m00_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[3]); //ALPHA_BETA_4m00;
+    m10_4 = _mm_adds_epi16(alpha_ptr[5],beta_ptr[2]); //ALPHA_BETA_4m10;
+    m11_4 = _mm_adds_epi16(alpha_ptr[7],beta_ptr[7]); //ALPHA_BETA_4m11;
+    m01_4 = _mm_adds_epi16(alpha_ptr[5],beta_ptr[6]); //ALPHA_BETA_4m01;
+
+
+#endif
+
     /*
       print_shorts("m11_1:",&m11_1);
       print_shorts("m11_2:",&m11_2);
@@ -1030,19 +1199,19 @@ unsigned char phy_threegpplte_turbo_decoder16(short *y,
       n is the size in bits of the coded block, with the tail */
 
 
-  llr_t systematic0[n+16] __attribute__ ((aligned(16)));
-  llr_t systematic1[n+16] __attribute__ ((aligned(16)));
-  llr_t systematic2[n+16] __attribute__ ((aligned(16)));
-  llr_t yparity1[n+16] __attribute__ ((aligned(16)));
-  llr_t yparity2[n+16] __attribute__ ((aligned(16)));
+  llr_t systematic0[n+16] __attribute__ ((aligned(32)));
+  llr_t systematic1[n+16] __attribute__ ((aligned(32)));
+  llr_t systematic2[n+16] __attribute__ ((aligned(32)));
+  llr_t yparity1[n+16] __attribute__ ((aligned(32)));
+  llr_t yparity2[n+16] __attribute__ ((aligned(32)));
 
-  llr_t ext[n+128] __attribute__((aligned(16)));
-  llr_t ext2[n+128] __attribute__((aligned(16)));
+  llr_t ext[n+128] __attribute__((aligned(32)));
+  llr_t ext2[n+128] __attribute__((aligned(32)));
 
-  llr_t alpha[(n+16)*8] __attribute__ ((aligned(16)));
-  llr_t beta[(n+16)*8] __attribute__ ((aligned(16)));
-  llr_t m11[n+16] __attribute__ ((aligned(16)));
-  llr_t m10[n+16] __attribute__ ((aligned(16)));
+  llr_t alpha[(n+16)*8] __attribute__ ((aligned(32)));
+  llr_t beta[(n+16)*8] __attribute__ ((aligned(32)));
+  llr_t m11[n+32] __attribute__ ((aligned(32)));
+  llr_t m10[n+32] __attribute__ ((aligned(32)));
 
 
   int *pi2_p,*pi4_p,*pi5_p,*pi6_p;
diff --git a/openair1/PHY/LTE_REFSIG/lte_gold.c b/openair1/PHY/LTE_REFSIG/lte_gold.c
index 567851a51a6cd376f2cd6841782dd7fb374b402d..bfd67166408a9e3a1e777cd30ffcbd6118812a9b 100644
--- a/openair1/PHY/LTE_REFSIG/lte_gold.c
+++ b/openair1/PHY/LTE_REFSIG/lte_gold.c
@@ -61,21 +61,18 @@ void lte_gold(LTE_DL_FRAME_PARMS *frame_parms,uint32_t lte_gold_table[20][2][14]
 
       x2 = Ncp +
            (Nid_cell<<1) +
-           (((1+(Nid_cell<<1))*(1 + (((frame_parms->Ncp==0)?4:3)*l) + (7*(1+ns))))<<10); //cinit
+	(((1+(Nid_cell<<1))*(1 + (((frame_parms->Ncp==0)?4:3)*l) + (7*(1+ns))))<<10); //cinit
       //x2 = frame_parms->Ncp + (Nid_cell<<1) + (1+(Nid_cell<<1))*(1 + (3*l) + (7*(1+ns))); //cinit
       //n = 0
-      //      printf("cinit (ns %d, l %d) => %d\n",ns,l,x2);
       x1 = 1+ (1<<31);
       x2=x2 ^ ((x2 ^ (x2>>1) ^ (x2>>2) ^ (x2>>3))<<31);
 
       // skip first 50 double words (1600 bits)
-      //printf("n=0 : x1 %x, x2 %x\n",x1,x2);
       for (n=1; n<50; n++) {
         x1 = (x1>>1) ^ (x1>>4);
         x1 = x1 ^ (x1<<31) ^ (x1<<28);
         x2 = (x2>>1) ^ (x2>>2) ^ (x2>>3) ^ (x2>>4);
         x2 = x2 ^ (x2<<31) ^ (x2<<30) ^ (x2<<29) ^ (x2<<28);
-        //  printf("x1 : %x, x2 : %x\n",x1,x2);
       }
 
       for (n=0; n<14; n++) {
@@ -84,7 +81,6 @@ void lte_gold(LTE_DL_FRAME_PARMS *frame_parms,uint32_t lte_gold_table[20][2][14]
         x2 = (x2>>1) ^ (x2>>2) ^ (x2>>3) ^ (x2>>4);
         x2 = x2 ^ (x2<<31) ^ (x2<<30) ^ (x2<<29) ^ (x2<<28);
         lte_gold_table[ns][l][n] = x1^x2;
-        //  printf("n=%d : c %x\n",n,x1^x2);
       }
 
     }
diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c b/openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c
index 66df9dbb78204837036b3f15d3ad8c771fea612c..801c1a4f93ce3c93657c7e742d88dcc7fb512208 100644
--- a/openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c
+++ b/openair1/PHY/LTE_TRANSPORT/dlsch_decoding.c
@@ -446,7 +446,8 @@ uint32_t  dlsch_decoding(PHY_VARS_UE *phy_vars_ue,
     printf("\n");
     */
 
-#ifndef __AVX2__
+    //#ifndef __AVX2__
+#if 1
     if (err_flag == 0) {
 
       start_meas(dlsch_turbo_decoding_stats);
diff --git a/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c b/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
index ad464b16babd2b06c64035f0a40379bf73da58fe..264a3c3610c08e9a9438a271ca3e5da225f1fc82 100644
--- a/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
+++ b/openair1/PHY/LTE_TRANSPORT/dlsch_demodulation.c
@@ -1898,17 +1898,17 @@ void dlsch_channel_compensation_TM3(LTE_DL_FRAME_PARMS *frame_parms,
 
   for (aarx=0; aarx<frame_parms->nb_antennas_rx; aarx++) {
 
-    dl_ch0_128          = (__m128i *)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12];
-    dl_ch1_128          = (__m128i *)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12];
+    dl_ch0_128          = (__m128i *)&dl_ch_estimates_ext[aarx][symbol*frame_parms->N_RB_DL*12];    // hr,0
+    dl_ch1_128          = (__m128i *)&dl_ch_estimates_ext[2+aarx][symbol*frame_parms->N_RB_DL*12];  // hr,1
 
 
     dl_ch_mag0_128      = (__m128i *)&dl_ch_mag0[aarx][symbol*frame_parms->N_RB_DL*12];
     dl_ch_mag0_128b     = (__m128i *)&dl_ch_magb0[aarx][symbol*frame_parms->N_RB_DL*12];
     dl_ch_mag1_128      = (__m128i *)&dl_ch_mag1[aarx][symbol*frame_parms->N_RB_DL*12];
     dl_ch_mag1_128b     = (__m128i *)&dl_ch_magb1[aarx][symbol*frame_parms->N_RB_DL*12];
-    rxdataF128          = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12];
-    rxdataF_comp0_128   = (__m128i *)&rxdataF_comp0[aarx][symbol*frame_parms->N_RB_DL*12];
-    rxdataF_comp1_128   = (__m128i *)&rxdataF_comp1[aarx][symbol*frame_parms->N_RB_DL*12];
+    rxdataF128          = (__m128i *)&rxdataF_ext[aarx][symbol*frame_parms->N_RB_DL*12];           // yr
+    rxdataF_comp0_128   = (__m128i *)&rxdataF_comp0[aarx][symbol*frame_parms->N_RB_DL*12];         // yr,0 = yr * conj(hr,0)
+    rxdataF_comp1_128   = (__m128i *)&rxdataF_comp1[aarx][symbol*frame_parms->N_RB_DL*12];         // yr,1 = yr * conj(hr,1)
 
 
     for (rb=0; rb<nb_rb; rb++) {